Browse Source

javascript: ignore querystrings when looking up artifacts

Matt Robenolt 7 years ago
parent
commit
a612fd6d14

+ 2 - 0
CHANGES

@@ -1,6 +1,8 @@
 Version 8.21 (Unreleased)
 -------------------------
 
+- Ignore querystrings when looking up release artifacts
+
 Version 8.20
 ------------
 - Make BitBucket repositories enabled by default

+ 13 - 18
src/sentry/lang/javascript/processor.py

@@ -11,7 +11,7 @@ import zlib
 from django.conf import settings
 from os.path import splitext
 from requests.utils import get_encoding_from_headers
-from six.moves.urllib.parse import urlparse, urljoin, urlsplit
+from six.moves.urllib.parse import urljoin, urlsplit
 from libsourcemap import from_json as view_from_json
 
 # In case SSL is unavailable (light builds) we can't import this here.
@@ -197,29 +197,19 @@ def discover_sourcemap(result):
 def fetch_release_file(filename, release, dist=None):
     cache_key = 'releasefile:v1:%s:%s' % (release.id, md5_text(filename).hexdigest(), )
 
-    filename_path = None
-    if filename is not None:
-        # Reconstruct url without protocol + host
-        # e.g. http://example.com/foo?bar => ~/foo?bar
-        parsed_url = urlparse(filename)
-        filename_path = '~' + parsed_url.path
-        if parsed_url.query:
-            filename_path += '?' + parsed_url.query
-
     logger.debug('Checking cache for release artifact %r (release_id=%s)', filename, release.id)
     result = cache.get(cache_key)
 
     dist_name = dist and dist.name or None
 
     if result is None:
+        filename_choices = ReleaseFile.normalize(filename)
+        filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices]
+
         logger.debug(
             'Checking database for release artifact %r (release_id=%s)', filename, release.id
         )
 
-        filename_idents = [ReleaseFile.get_ident(filename, dist_name)]
-        if filename_path is not None and filename_path != filename:
-            filename_idents.append(ReleaseFile.get_ident(filename_path, dist_name))
-
         possible_files = list(
             ReleaseFile.objects.filter(
                 release=release,
@@ -237,10 +227,15 @@ def fetch_release_file(filename, release, dist=None):
         elif len(possible_files) == 1:
             releasefile = possible_files[0]
         else:
-            # Prioritize releasefile that matches full url (w/ host)
-            # over hostless releasefile
-            target_ident = filename_idents[0]
-            releasefile = next((f for f in possible_files if f.ident == target_ident))
+            # Pick first one that matches in priority order.
+            # This is O(N*M) but there are only ever at most 4 things here
+            # so not really worth optimizing.
+            releasefile = next((
+                rf
+                for ident in filename_idents
+                for rf in possible_files
+                if rf.ident == ident
+            ))
 
         logger.debug(
             'Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id

+ 23 - 0
src/sentry/models/releasefile.py

@@ -9,6 +9,7 @@ sentry.models.releasefile
 from __future__ import absolute_import
 
 from django.db import models
+from six.moves.urllib.parse import urlsplit, urlunsplit
 
 from sentry.db.models import BoundedPositiveIntegerField, FlexibleForeignKey, Model, sane_repr
 from sentry.utils.hashlib import sha1_text
@@ -59,3 +60,25 @@ class ReleaseFile(Model):
         if dist is not None:
             return sha1_text(name + '\x00\x00' + dist).hexdigest()
         return sha1_text(name).hexdigest()
+
+    @classmethod
+    def normalize(cls, url):
+        """Transforms a full absolute url into 2 or 4 generalized options
+
+        * the original url as input
+        * (optional) original url without querystring
+        * the full url, but stripped of scheme and netloc
+        * (optional) full url without scheme and netloc or querystring
+        """
+        # Always ignore the fragment
+        scheme, netloc, path, query, _ = urlsplit(url)
+        uri_relative = (None, None, path, query, None)
+        uri_without_query = (scheme, netloc, path, None, None)
+        uri_relative_without_query = (None, None, path, None, None)
+        urls = [url]
+        if query:
+            urls.append(urlunsplit(uri_without_query))
+        urls.append('~' + urlunsplit(uri_relative))
+        if query:
+            urls.append('~' + urlunsplit(uri_relative_without_query))
+        return urls

+ 35 - 0
tests/sentry/lang/javascript/test_processor.py

@@ -130,6 +130,41 @@ class FetchReleaseFileTest(TestCase):
 
         assert result == new_result
 
+    def test_fallbacks(self):
+        project = self.project
+        release = Release.objects.create(
+            organization_id=project.organization_id,
+            version='abc',
+        )
+        release.add_project(project)
+
+        file = File.objects.create(
+            name='~/file.min.js',
+            type='release.file',
+            headers={'Content-Type': 'application/json; charset=utf-8'},
+        )
+
+        binary_body = unicode_body.encode('utf-8')
+        file.putfile(six.BytesIO(binary_body))
+
+        ReleaseFile.objects.create(
+            name='~/file.min.js',
+            release=release,
+            organization_id=project.organization_id,
+            file=file,
+        )
+
+        result = fetch_release_file('http://example.com/file.min.js?lol', release)
+
+        assert type(result.body) is six.binary_type
+        assert result == http.UrlResult(
+            'http://example.com/file.min.js?lol',
+            {'content-type': 'application/json; charset=utf-8'},
+            binary_body,
+            200,
+            'utf-8',
+        )
+
 
 class FetchFileTest(TestCase):
     @responses.activate

+ 36 - 0
tests/sentry/models/test_releasefile.py

@@ -0,0 +1,36 @@
+from __future__ import absolute_import
+
+from sentry.models import ReleaseFile
+from sentry.testutils import TestCase
+
+
+class ReleaseFileTestCase(TestCase):
+    def test_normalize(self):
+        n = ReleaseFile.normalize
+
+        assert n('http://example.com') == [
+            'http://example.com',
+            '~',
+        ]
+        assert n('http://example.com/foo.js') == [
+            'http://example.com/foo.js',
+            '~/foo.js',
+        ]
+        assert n('http://example.com/foo.js?bar') == [
+            'http://example.com/foo.js?bar',
+            'http://example.com/foo.js',
+            '~/foo.js?bar',
+            '~/foo.js',
+        ]
+        assert n('/foo.js') == [
+            '/foo.js',
+            '~/foo.js',
+        ]
+
+        # This is the current behavior, but seems weird to me.
+        # unclear if we actually experience this case in the real
+        # world, but worth documenting the behavior
+        assert n('foo.js') == [
+            'foo.js',
+            '~foo.js',
+        ]