
feat(http): Check in the http code if we're above the cache limit (#25027)

Armin Ronacher 3 years ago
commit 70219677d8

+ 21 - 10
src/sentry/http.py

@@ -133,6 +133,22 @@ def expose_url(url):
     return url
 
 
+def get_domain_key(url):
+    domain = urlparse(url).netloc
+    return f"source:blacklist:v2:{md5_text(domain).hexdigest()}"
+
+
+def lock_domain(url, error=None):
+    error = dict(error or {})
+    if error.get("type") is None:
+        error["type"] = EventError.UNKNOWN_ERROR
+    if error.get("url") is None:
+        error["url"] = expose_url(url)
+    domain_key = get_domain_key(url)
+    cache.set(domain_key, error, 300)
+    logger.warning("source.disabled", extra=error)
+
+
 def fetch_file(
     url,
     domain_lock_enabled=True,
@@ -148,8 +164,7 @@ def fetch_file(
     """
     # lock down domains that are problematic
     if domain_lock_enabled:
-        domain = urlparse(url).netloc
-        domain_key = f"source:blacklist:v2:{md5_text(domain).hexdigest()}"
+        domain_key = get_domain_key(url)
         domain_result = cache.get(domain_key)
         if domain_result:
             domain_result["url"] = url
@@ -201,19 +216,17 @@ def fetch_file(
             except Exception as exc:
                 logger.debug("Unable to fetch %r", url, exc_info=True)
                 if isinstance(exc, RestrictedIPAddress):
-                    error = {"type": EventError.RESTRICTED_IP, "url": expose_url(url)}
+                    error = {"type": EventError.RESTRICTED_IP}
                 elif isinstance(exc, SuspiciousOperation):
-                    error = {"type": EventError.SECURITY_VIOLATION, "url": expose_url(url)}
+                    error = {"type": EventError.SECURITY_VIOLATION}
                 elif isinstance(exc, (Timeout, ReadTimeout)):
                     error = {
                         "type": EventError.FETCH_TIMEOUT,
-                        "url": expose_url(url),
                         "timeout": settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                     }
                 elif isinstance(exc, OverflowError):
                     error = {
                         "type": EventError.FETCH_TOO_LARGE,
-                        "url": expose_url(url),
                         # We want size in megabytes to format nicely
                         "max_size": float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                     }
@@ -221,16 +234,14 @@ def fetch_file(
                     error = {
                         "type": EventError.FETCH_GENERIC_ERROR,
                         "value": str(type(exc)),
-                        "url": expose_url(url),
                     }
                 else:
                     logger.exception(str(exc))
-                    error = {"type": EventError.UNKNOWN_ERROR, "url": expose_url(url)}
+                    error = {"type": EventError.UNKNOWN_ERROR}
 
                 # TODO(dcramer): we want to be less aggressive on disabling domains
                 if domain_lock_enabled:
-                    cache.set(domain_key, error or "", 300)
-                    logger.warning("source.disabled", extra=error)
+                    lock_domain(url, error)
                 raise CannotFetch(error)
 
             headers = {k.lower(): v for k, v in response.headers.items()}
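
For context, here is a minimal, standalone sketch of the domain-lock flow that the new helpers implement: one cache key per domain, the error stored under it for five minutes, and a fast-path check at the top of fetch_file. The SimpleCache class and the "unknown_error" string are illustrative stand-ins for sentry.utils.cache and EventError.UNKNOWN_ERROR; the key format, the 300-second TTL, and the default-filling behavior mirror the diff above, everything else is hypothetical.

import hashlib
import time
from urllib.parse import urlparse


class SimpleCache:
    """Illustrative stand-in for sentry.utils.cache with TTL semantics."""

    def __init__(self):
        self._store = {}

    def set(self, key, value, timeout):
        self._store[key] = (value, time.monotonic() + timeout)

    def get(self, key):
        entry = self._store.get(key)
        if entry is None:
            return None
        value, expires_at = entry
        if time.monotonic() >= expires_at:
            del self._store[key]
            return None
        return value


cache = SimpleCache()


def get_domain_key(url):
    # One cache key per domain, same shape as the helper in the diff.
    domain = urlparse(url).netloc
    return f"source:blacklist:v2:{hashlib.md5(domain.encode('utf-8')).hexdigest()}"


def lock_domain(url, error=None):
    # Store the error for 5 minutes so subsequent fetches from the same
    # domain short-circuit instead of re-issuing the HTTP request.
    error = dict(error or {})
    error.setdefault("type", "unknown_error")  # EventError.UNKNOWN_ERROR in Sentry
    error.setdefault("url", url)
    cache.set(get_domain_key(url), error, 300)


def fetch_file(url):
    # The fast-path check at the top of fetch_file in the diff.
    cached_error = cache.get(get_domain_key(url))
    if cached_error is not None:
        raise RuntimeError(f"domain locked: {cached_error}")
    ...  # perform the actual HTTP fetch here


lock_domain("http://example.com/app.js", {"type": "fetch_timeout"})
# Any URL on the same domain now fails fast until the lock expires:
# fetch_file("http://example.com/vendor.js") -> RuntimeError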

+ 12 - 0
src/sentry/lang/javascript/processor.py

@@ -416,6 +416,18 @@ def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True):
                 get_max_age(result.headers),
             )
 
+            # Since the cache.set above can fail silently (e.g. when the file
+            # is too large for the cache backend), read the key back to verify
+            # the write. If it is missing, abort the fetch, cache a failure,
+            # and lock the domain for future http fetches.
+            if cache.get(cache_key) is None:
+                error = {
+                    "type": EventError.TOO_LARGE_FOR_CACHE,
+                    "url": http.expose_url(url),
+                }
+                http.lock_domain(url, error=error)
+                raise http.CannotFetch(error)
+
     # If we did not get a 200 OK we just raise a cannot fetch here.
     if result.status != 200:
         raise http.CannotFetch(
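
The read-back in the hunk above exists because common cache backends drop oversized values without raising; memcached's default 1 MB item limit is the classic example, so reading the key back is the only reliable signal that the set succeeded. A minimal sketch of that pattern, with an illustrative SizeLimitedCache standing in for the real backend and hypothetical key/URL values:

MAX_ITEM_SIZE = 1024 * 1024  # assumed backend limit (memcached's default)


class SizeLimitedCache:
    """Illustrative cache that silently refuses oversized values."""

    def __init__(self, max_size):
        self._store = {}
        self._max_size = max_size

    def set(self, key, value, timeout=None):
        # Dropped without any error when too big, like memcached.
        if len(value) <= self._max_size:
            self._store[key] = value

    def get(self, key):
        return self._store.get(key)


cache = SizeLimitedCache(MAX_ITEM_SIZE)

body = b"x" * (2 * MAX_ITEM_SIZE)  # pretend this is the fetched source file
cache.set("source:fetch:abc123", body, 60)

# The set raised nothing, so only a read-back reveals that the value
# was never stored.
if cache.get("source:fetch:abc123") is None:
    error = {"type": "too_large_for_cache", "url": "http://example.com/app.js"}
    # In the diff this becomes http.lock_domain(url, error=error)
    # followed by raise http.CannotFetch(error).
    print("aborting fetch:", error)

Locking the domain here means later events from the same project do not repeatedly re-download a file that can never fit in the cache.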

+ 3 - 1
src/sentry/models/eventerror.py

@@ -20,6 +20,7 @@ class EventError:
     FETCH_INVALID_ENCODING = "fetch_invalid_source_encoding"
     FETCH_TOO_LARGE = "fetch_too_large"
     FETCH_TIMEOUT = "fetch_timeout"
+    TOO_LARGE_FOR_CACHE = "too_large_for_cache"
 
     # Processing: JavaScript
     JS_GENERIC_FETCH_ERROR = "js_generic_fetch_error"  # deprecated in favor of FETCH_GENERIC_ERROR
@@ -65,8 +66,9 @@ class EventError:
         FETCH_GENERIC_ERROR: "Unable to fetch HTTP resource",
         FETCH_INVALID_HTTP_CODE: "HTTP returned error response",
         FETCH_INVALID_ENCODING: "Source file was not encoded properly",
-        FETCH_TOO_LARGE: "Remote file too large",
+        FETCH_TOO_LARGE: "Remote file too large for downloading",
         FETCH_TIMEOUT: "Remote file took too long to load",
+        TOO_LARGE_FOR_CACHE: "Remote file too large for caching",
         JS_GENERIC_FETCH_ERROR: "Unable to fetch resource",
         JS_INVALID_HTTP_CODE: "HTTP returned error response",
         JS_INVALID_CONTENT: "Source file was not JavaScript",

+ 29 - 0
tests/sentry/lang/javascript/test_processor.py

@@ -515,6 +515,35 @@ class FetchFileTest(TestCase):
 
         assert result == result2
 
+    @responses.activate
+    def test_too_large_for_cache(self):
+        # make the file cache fail: drop every key except the domain-lock one
+        domain_key = http.get_domain_key("http://example.com")
+
+        original_get = cache.get
+
+        def cache_get(key):
+            if key == domain_key:
+                return original_get(key)
+
+        with patch("sentry.utils.cache.cache.get", side_effect=cache_get):
+            responses.add(
+                responses.GET,
+                "http://example.com",
+                body=b"Stuff",
+                content_type="application/json; charset=utf-8",
+            )
+
+            with pytest.raises(http.CannotFetch) as exc:
+                fetch_file("http://example.com")
+
+            assert exc.value.data["type"] == EventError.TOO_LARGE_FOR_CACHE
+
+            assert cache.get(domain_key) == {
+                "type": "too_large_for_cache",
+                "url": "http://example.com",
+            }
+
     @responses.activate
     def test_truncated(self):
         url = truncatechars("http://example.com", 3)