Browse Source

fix(http-interface): Improve request body decoding (#6064)

This updates the request interface to handle decoding the HTTP body by
attempting JSON and URL decoding, then marking the request with the
'inferred' type that it was able to decode the body as, discounting the
content-type header.

This corrects handling of various situations:

 - When an object is passed as the data in the request context http
   interface would immediately assume it must be JSON and store it in
   the data field as a string, later in the interface it is rendered as
   JSON.

   See GH-5673

 - We attempted to always parse JSON from the data on the client side no
   matter the Content-Type header. This improves that logic by doing the
   parsing on the server side in the http interface and adding support
   for url encoded string decoding.

   This correction fixes body sanitization as we can now say with a
   higher degree of certainty that the body will have been decoded
   correctly before passing through sanitization.

   See GH-5599

 - Inferring the content type also allows us to more accurately render
   the data in the interface.

Fixes GH-5673
Fixes GH-5599
Evan Purkhiser 7 years ago
parent
commit
17dca368e4

+ 3 - 0
CHANGES

@@ -19,6 +19,9 @@ Version 8.20
 - Add initial support for Redis Cluster.
 - Support a list of hosts in the ``redis.clusters`` configuration along side
   the traditional dictionary style configuration.
+- Better support for rendering rich JSON and URL encoded HTTP bodies by
+  guessing the content type based on format heuristics.
+- Better support for sanitizing of string HTTP bodies.
 
 Schema Changes
 ~~~~~~~~~~~~~~

+ 21 - 8
src/sentry/interfaces/http.py

@@ -18,8 +18,8 @@ from django.utils.translation import ugettext as _
 from six.moves.urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
 
 from sentry.interfaces.base import Interface, InterfaceValidationError
-from sentry.utils import json
 from sentry.utils.safe import trim, trim_dict, trim_pairs
+from sentry.utils.http import heuristic_decode
 from sentry.web.helpers import render_to_string
 
 # Instead of relying on a list of hardcoded methods, just loosly match
@@ -167,13 +167,29 @@ class Http(Interface):
         else:
             headers = ()
 
+        # We prefer the body to be a string, since we can then attempt to parse it
+        # as JSON OR decode it as a URL encoded query string, without relying on
+        # the correct content type header being passed.
         body = data.get('data')
-        if isinstance(body, dict):
-            body = json.dumps(body)
+
+        content_type = next((v for k, v in headers if k == 'Content-Type'), None)
+
+        # Remove content type parameters
+        if content_type is not None:
+            content_type = content_type.partition(';')[0].rstrip()
+
+        # We process request data once during ingestion and again when
+        # requesting the http interface over the API. Avoid overwriting
+        # decoding the body again.
+        inferred_content_type = data.get('inferred_content_type', content_type)
+
+        if 'inferred_content_type' not in data and not isinstance(body, dict):
+            body, inferred_content_type = heuristic_decode(body, content_type)
 
         if body:
             body = trim(body, settings.SENTRY_MAX_HTTP_BODY_SIZE)
 
+        kwargs['inferred_content_type'] = inferred_content_type
         kwargs['cookies'] = trim_pairs(format_cookies(cookies))
         kwargs['env'] = trim_dict(data.get('env') or {})
         kwargs['headers'] = trim_pairs(headers)
@@ -217,10 +233,6 @@ class Http(Interface):
         if is_public:
             return {}
 
-        data = self.data
-        if isinstance(data, dict):
-            data = json.dumps(data)
-
         cookies = self.cookies or ()
         if isinstance(cookies, dict):
             cookies = sorted(self.cookies.items())
@@ -234,9 +246,10 @@ class Http(Interface):
             'url': self.url,
             'query': self.query_string,
             'fragment': self.fragment,
-            'data': data,
+            'data': self.data,
             'headers': headers,
             'cookies': cookies,
             'env': self.env or None,
+            'inferredContentType': self.inferred_content_type,
         }
         return data

+ 11 - 48
src/sentry/static/sentry/app/components/events/interfaces/richHttpContent.jsx

@@ -6,6 +6,7 @@ import ClippedBox from '../../clippedBox';
 import KeyValueList from './keyValueList';
 import ContextData from '../../contextData';
 
+import {objectToSortedTupleArray} from './utils';
 import {objectIsEmpty} from '../../../utils';
 import {t} from '../../../locale';
 
@@ -14,51 +15,15 @@ const RichHttpContent = React.createClass({
     data: PropTypes.object.isRequired
   },
 
-  /**
-   * Converts an object of body/querystring key/value pairs
-   * into a tuple of [key, value] pairs, and sorts them.
-   *
-   * Note that the query-string parser returns dupes like this:
-   *   { foo: ['bar', 'baz'] } // ?foo=bar&bar=baz
-   *
-   * This method accounts for this.
-   */
-  objectToSortedTupleArray(obj) {
-    return Object.keys(obj)
-      .reduce((out, k) => {
-        let val = obj[k];
-        return out.concat(
-          {}.toString.call(val) === '[object Array]'
-            ? val.map(v => [k, v]) // key has multiple values (array)
-            : [[k, val]] // key has single value
-        );
-      }, [])
-      .sort(function([keyA, valA], [keyB, valB]) {
-        // if keys are identical, sort on value
-        if (keyA === keyB) {
-          return valA < valB ? -1 : 1;
-        }
-
-        return keyA < keyB ? -1 : 1;
-      });
-  },
-
   getBodySection(data) {
-    /*eslint no-empty:0*/
-    let contentType = data.headers.find(h => h[0] === 'Content-Type');
-    contentType = contentType && contentType[1].split(';')[0].toLowerCase();
-
-    // Ignoring Content-Type, we immediately just check if the body is parseable
-    // as JSON. Why? Because many applications don't set proper Content-Type values,
-    // e.g. x-www-form-urlencoded  actually contains JSON.
-    try {
-      return <ContextData data={JSON.parse(data.data)} />;
-    } catch (e) {}
-
-    if (contentType === 'application/x-www-form-urlencoded') {
-      return this.getQueryStringOrRaw(data.data);
-    } else {
-      return <pre>{JSON.stringify(data.data, null, 2)}</pre>;
+    // The http interface provides an inferred content type for the data body.
+    switch (data.inferredContentType) {
+      case 'application/json':
+        return <ContextData data={data.data} />;
+      case 'application/x-www-form-urlencoded':
+        return <KeyValueList data={objectToSortedTupleArray(data.data)} />;
+      default:
+        return <pre>{JSON.stringify(data.data, null, 2)}</pre>;
     }
   },
 
@@ -66,9 +31,7 @@ const RichHttpContent = React.createClass({
     try {
       // Sentry API abbreviates long query string values, sometimes resulting in
       // an un-parsable querystring ... stay safe kids
-      return (
-        <KeyValueList data={this.objectToSortedTupleArray(queryString.parse(data))} />
-      );
+      return <KeyValueList data={objectToSortedTupleArray(queryString.parse(data))} />;
     } catch (e) {
       return <pre>{data}</pre>;
     }
@@ -103,7 +66,7 @@ const RichHttpContent = React.createClass({
           </ClippedBox>}
         {!objectIsEmpty(data.env) &&
           <ClippedBox title={t('Environment')} defaultCollapsed>
-            <KeyValueList data={this.objectToSortedTupleArray(data.env)} />
+            <KeyValueList data={objectToSortedTupleArray(data.env)} />
           </ClippedBox>}
       </div>
     );

+ 39 - 5
src/sentry/static/sentry/app/components/events/interfaces/utils.jsx

@@ -1,4 +1,3 @@
-import _ from 'lodash';
 import {defined} from '../../../utils';
 
 export function escapeQuotes(v) {
@@ -30,10 +29,15 @@ export function getCurlCommand(data) {
     result += ' \\\n -H "' + header[0] + ': ' + escapeQuotes(header[1] + '') + '"';
   }
 
-  if (_.isString(data.data)) {
-    result += ' \\\n --data "' + escapeQuotes(data.data) + '"';
-  } else if (defined(data.data)) {
-    result += ' \\\n --data "' + escapeQuotes(jQuery.param(data.data)) + '"';
+  switch (data.inferredContentType) {
+    case 'application/json':
+      result += ' \\\n --data "' + escapeQuotes(JSON.stringify(data.data)) + '"';
+      break;
+    case 'application/x-www-form-urlencoded':
+      result += ' \\\n --data "' + escapeQuotes(jQuery.param(data.data)) + '"';
+      break;
+    default:
+      result += ' \\\n --data "' + escapeQuotes(data.data) + '"';
   }
 
   result += ' \\\n "' + data.url;
@@ -45,3 +49,33 @@ export function getCurlCommand(data) {
   result += '"';
   return result;
 }
+
+/**
+ * Converts an object of body/querystring key/value pairs
+ * into a tuple of [key, value] pairs, and sorts them.
+ *
+ * This handles the case for query strings that were decoded like so:
+ *
+ *   ?foo=bar&foo=baz => { foo: ['bar', 'baz'] }
+ *
+ * By converting them to [['foo', 'bar'], ['foo', 'baz']]
+ */
+export function objectToSortedTupleArray(obj) {
+  return Object.keys(obj)
+    .reduce((out, k) => {
+      let val = obj[k];
+      return out.concat(
+        {}.toString.call(val) === '[object Array]'
+          ? val.map(v => [k, v]) // key has multiple values (array)
+          : [[k, val]] // key has single value
+      );
+    }, [])
+    .sort(function([keyA, valA], [keyB, valB]) {
+      // if keys are identical, sort on value
+      if (keyA === keyB) {
+        return valA < valB ? -1 : 1;
+      }
+
+      return keyA < keyB ? -1 : 1;
+    });
+}

+ 2 - 0
src/sentry/utils/data_scrubber.py

@@ -161,6 +161,8 @@ class SensitiveDataFilter(object):
 
                 data[n] = '&'.join('='.join(k) for k in querybits)
             else:
+                # Encoded structured data (HTTP bodies, headers) would have
+                # already been decoded by the request interface.
                 data[n] = varmap(self.sanitize, data[n])
 
     def filter_user(self, data):

+ 35 - 1
src/sentry/utils/http.py

@@ -11,9 +11,11 @@ import six
 
 from collections import namedtuple
 from django.conf import settings
-from six.moves.urllib.parse import urlencode, urljoin, urlparse
+from six.moves.urllib.parse import parse_qs, urlencode, urljoin, urlparse
+from functools import partial
 
 from sentry import options
+from sentry.utils import json
 
 ParsedUriMatch = namedtuple('ParsedUriMatch', ['scheme', 'domain', 'path'])
 
@@ -222,3 +224,35 @@ def origin_from_request(request):
     if rv in ('', 'null'):
         rv = origin_from_url(request.META.get('HTTP_REFERER'))
     return rv
+
+
+def heuristic_decode(data, possible_content_type=None):
+    """
+    Attempt to decode a HTTP body by trying JSON and Form URL decoders,
+    returning the decoded body (if decoding was successful) and the inferred
+    content type.
+    """
+    inferred_content_type = possible_content_type
+
+    form_encoded_parser = partial(
+        parse_qs,
+        strict_parsing=True,
+        keep_blank_values=True,
+    )
+
+    decoders = [
+        ('application/x-www-form-urlencoded', form_encoded_parser),
+        ('application/json', json.loads),
+    ]
+
+    # Prioritize the decoder which supports the possible content type first.
+    decoders.sort(key=lambda d: d[0] == possible_content_type, reverse=True)
+
+    for decoding_type, decoder in decoders:
+        try:
+            return (decoder(data), decoding_type)
+        except Exception:
+            # Try another decoder
+            continue
+
+    return (data, inferred_content_type)

+ 9 - 70
tests/js/spec/components/events/interfaces/richHttpContent.spec.jsx

@@ -20,50 +20,20 @@ describe('RichHttpContent', function() {
     this.sandbox.restore();
   });
 
-  describe('objectToSortedTupleArray', function() {
-    it('should convert a key/value object to a sorted array of key/value tuples', function() {
-      let elem = this.elem;
-      // expect(
-      //   elem.objectToSortedTupleArray({
-      //     awe: 'some',
-      //     foo: 'bar',
-      //     bar: 'baz'
-      //   })
-      // ).toEqual([
-      //   // note sorted alphabetically by key
-      //   ['awe', 'some'],
-      //   ['bar', 'baz'],
-      //   ['foo', 'bar']
-      // ]);
-
-      expect(
-        elem.objectToSortedTupleArray({
-          foo: ['bar', 'baz']
-        })
-      ).toEqual([['foo', 'bar'], ['foo', 'baz']]);
-
-      // expect(
-      //   elem.objectToSortedTupleArray({
-      //     foo: ''
-      //   })
-      // ).toEqual([['foo', '']]);
-    });
-  });
-
   describe('getBodySection', function() {
-    it('should return plain-text when unrecognized Content-Type and not parsable as JSON', function() {
+    it('should return plain-text when given unrecognized inferred Content-Type', function() {
       let out = this.elem.getBodySection({
-        headers: [], // no content-type header,
+        inferredContentType: null, // no inferred content type
         data: 'helloworld'
       });
 
       expect(out.type).toEqual('pre');
     });
 
-    it('should return a KeyValueList element when Content-Type is x-www-form-urlencoded', function() {
+    it('should return a KeyValueList element when inferred Content-Type is x-www-form-urlencoded', function() {
       let out = this.elem.getBodySection({
-        headers: [['lol', 'no'], ['Content-Type', 'application/x-www-form-urlencoded']], // no content-type header,
-        data: 'foo=bar&bar=baz'
+        inferredContentType: 'application/x-www-form-urlencoded',
+        data: {foo: ['bar'], bar: ['baz']}
       });
 
       // NOTE: ContextData is stubbed in tests; instead returns <div className="ContextData"/>
@@ -71,19 +41,10 @@ describe('RichHttpContent', function() {
       expect(out.props.data).toEqual([['bar', 'baz'], ['foo', 'bar']]);
     });
 
-    it('should return plain-text when Content-Type is x-www-form-urlencoded and query string cannot be parsed', function() {
-      let out = this.elem.getBodySection({
-        headers: [['Content-Type', 'application/x-www-form-urlencoded']],
-        data: 'foo=hello%2...' // note: broken URL encoded value (%2 vs %2F)
-      });
-
-      expect(out.type).toEqual('pre');
-    });
-
-    it('should return a ContextData element when Content-Type is application/json', function() {
+    it('should return a ContextData element when inferred Content-Type is application/json', function() {
       let out = this.elem.getBodySection({
-        headers: [['lol', 'no'], ['Content-Type', 'application/json']], // no content-type header,
-        data: JSON.stringify({foo: 'bar'})
+        inferredContentType: 'application/json',
+        data: {foo: 'bar'}
       });
 
       // NOTE: ContextData is stubbed in tests; instead returns <div className="ContextData"/>
@@ -93,29 +54,7 @@ describe('RichHttpContent', function() {
       });
     });
 
-    it('should return a ContextData element when content is JSON, ignoring Content-Type', function() {
-      let out = this.elem.getBodySection({
-        headers: [['Content-Type', 'application/x-www-form-urlencoded']], // no content-type header,
-        data: JSON.stringify({foo: 'bar'})
-      });
-
-      // NOTE: ContextData is stubbed in tests; instead returns <div className="ContextData"/>
-      expect(out.type.displayName).toEqual('ContextData');
-      expect(out.props.data).toEqual({
-        foo: 'bar'
-      });
-    });
-
-    it('should return plain-text when JSON is not parsable', function() {
-      let out = this.elem.getBodySection({
-        headers: [['lol', 'no'], ['Content-Type', 'application/json']],
-        data: 'lol not json'
-      });
-
-      expect(out.type).toEqual('pre');
-    });
-
-    it('should now blow up in a malformed uri', function() {
+    it('should not blow up in a malformed uri', function() {
       // > decodeURIComponent('a%AFc')
       // URIError: URI malformed
       let data = {

+ 33 - 1
tests/js/spec/components/events/interfaces/utils.spec.jsx

@@ -1,4 +1,7 @@
-import {getCurlCommand} from 'app/components/events/interfaces/utils';
+import {
+  getCurlCommand,
+  objectToSortedTupleArray
+} from 'app/components/events/interfaces/utils';
 
 describe('components/interfaces/utils', function() {
   describe('getCurlCommand()', function() {
@@ -60,4 +63,33 @@ describe('components/interfaces/utils', function() {
       );
     });
   });
+
+  describe('objectToSortedTupleArray()', function() {
+    it('should convert a key/value object to a sorted array of key/value tuples', function() {
+      // expect(
+      //   objectToSortedTupleArray({
+      //     awe: 'some',
+      //     foo: 'bar',
+      //     bar: 'baz'
+      //   })
+      // ).toEqual([
+      //   // note sorted alphabetically by key
+      //   ['awe', 'some'],
+      //   ['bar', 'baz'],
+      //   ['foo', 'bar']
+      // ]);
+
+      expect(
+        objectToSortedTupleArray({
+          foo: ['bar', 'baz']
+        })
+      ).toEqual([['foo', 'bar'], ['foo', 'baz']]);
+
+      // expect(
+      //   objectToSortedTupleArray({
+      //     foo: ''
+      //   })
+      // ).toEqual([['foo', '']]);
+    });
+  });
 });

+ 2 - 2
tests/sentry/interfaces/test_http.py

@@ -77,7 +77,7 @@ class HttpTest(TestCase):
             url='http://example.com',
             data={'foo': 'bar'},
         ))
-        assert result.data == '{"foo":"bar"}'
+        assert result.data == {'foo': 'bar'}
 
     def test_form_encoded_data(self):
         result = Http.to_python(
@@ -87,7 +87,7 @@ class HttpTest(TestCase):
                 data='foo=bar',
             )
         )
-        assert result.data == 'foo=bar'
+        assert result.data == {'foo': ['bar']}
 
     def test_cookies_as_string(self):
         result = Http.to_python(dict(

+ 35 - 0
tests/sentry/utils/http/tests.py

@@ -16,6 +16,7 @@ from sentry.utils.http import (
     get_origins,
     absolute_uri,
     origin_from_request,
+    heuristic_decode,
 )
 from sentry.utils.data_filters import (
     is_valid_ip,
@@ -335,3 +336,37 @@ class OriginFromRequestTestCase(TestCase):
 
         request.META['HTTP_REFERER'] = 'http://example.com'
         assert origin_from_request(request) == 'http://example.com'
+
+
+class HeuristicDecodeTestCase(TestCase):
+    json_body = '{"key": "value", "key2": "value2"}'
+    url_body = 'key=value&key2=value2'
+
+    def test_json(self):
+        data, content_type = heuristic_decode(self.json_body, 'application/json')
+        assert data == {'key': 'value', 'key2': 'value2'}
+        assert content_type == 'application/json'
+
+    def test_url_encoded(self):
+        data, content_type = heuristic_decode(self.url_body, 'application/x-www-form-urlencoded')
+        assert data == {'key': ['value'], 'key2': ['value2']}
+        assert content_type == 'application/x-www-form-urlencoded'
+
+    def test_possible_type_mismatch(self):
+        data, content_type = heuristic_decode(self.json_body, 'application/x-www-form-urlencoded')
+        assert data == {'key': 'value', 'key2': 'value2'}
+        assert content_type == 'application/json'
+
+        data, content_type = heuristic_decode(self.url_body, 'application/json')
+        assert data == {'key': ['value'], 'key2': ['value2']}
+        assert content_type == 'application/x-www-form-urlencoded'
+
+    def test_no_possible_type(self):
+        data, content_type = heuristic_decode(self.json_body)
+        assert data == {'key': 'value', 'key2': 'value2'}
+        assert content_type == 'application/json'
+
+    def test_unable_to_decode(self):
+        data, content_type = heuristic_decode('string body', 'text/plain')
+        assert data == 'string body'
+        assert content_type == 'text/plain'

Some files were not shown because too many files changed in this diff