
feat(http): Check in the http code if we're above the cache limit (#25027)

Armin Ronacher 3 years ago
commit 70219677d8

+ 21 - 10
src/sentry/http.py

@@ -133,6 +133,22 @@ def expose_url(url):
     return url
 
 
+def get_domain_key(url):
+    domain = urlparse(url).netloc
+    return f"source:blacklist:v2:{md5_text(domain).hexdigest()}"
+
+
+def lock_domain(url, error=None):
+    error = dict(error or {})
+    if error.get("type") is None:
+        error["type"] = EventError.UNKNOWN_ERROR
+    if error.get("url") is None:
+        error["url"] = expose_url(url)
+    domain_key = get_domain_key(url)
+    cache.set(domain_key, error, 300)
+    logger.warning("source.disabled", extra=error)
+
+
 def fetch_file(
     url,
     domain_lock_enabled=True,
@@ -148,8 +164,7 @@ def fetch_file(
     """
     # lock down domains that are problematic
     if domain_lock_enabled:
-        domain = urlparse(url).netloc
-        domain_key = f"source:blacklist:v2:{md5_text(domain).hexdigest()}"
+        domain_key = get_domain_key(url)
         domain_result = cache.get(domain_key)
         if domain_result:
             domain_result["url"] = url
@@ -201,19 +216,17 @@ def fetch_file(
             except Exception as exc:
                 logger.debug("Unable to fetch %r", url, exc_info=True)
                 if isinstance(exc, RestrictedIPAddress):
-                    error = {"type": EventError.RESTRICTED_IP, "url": expose_url(url)}
+                    error = {"type": EventError.RESTRICTED_IP}
                 elif isinstance(exc, SuspiciousOperation):
-                    error = {"type": EventError.SECURITY_VIOLATION, "url": expose_url(url)}
+                    error = {"type": EventError.SECURITY_VIOLATION}
                 elif isinstance(exc, (Timeout, ReadTimeout)):
                     error = {
                         "type": EventError.FETCH_TIMEOUT,
-                        "url": expose_url(url),
                         "timeout": settings.SENTRY_SOURCE_FETCH_TIMEOUT,
                     }
                 elif isinstance(exc, OverflowError):
                     error = {
                         "type": EventError.FETCH_TOO_LARGE,
-                        "url": expose_url(url),
                         # We want size in megabytes to format nicely
                         "max_size": float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024,
                     }
@@ -221,16 +234,14 @@ def fetch_file(
                     error = {
                         "type": EventError.FETCH_GENERIC_ERROR,
                         "value": str(type(exc)),
-                        "url": expose_url(url),
                     }
                 else:
                     logger.exception(str(exc))
-                    error = {"type": EventError.UNKNOWN_ERROR, "url": expose_url(url)}
+                    error = {"type": EventError.UNKNOWN_ERROR}
 
                 # TODO(dcramer): we want to be less aggressive on disabling domains
                 if domain_lock_enabled:
-                    cache.set(domain_key, error or "", 300)
-                    logger.warning("source.disabled", extra=error)
+                    lock_domain(url, error)
                 raise CannotFetch(error)
 
             headers = {k.lower(): v for k, v in response.headers.items()}
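
For context, here is a minimal, standalone sketch of the domain-lock flow that the new helpers implement: one cache key per domain, the error stored under it for five minutes, and a fast-path check at the top of fetch_file. The SimpleCache class and the "unknown_error" string are illustrative stand-ins for sentry.utils.cache and EventError.UNKNOWN_ERROR; the key format, the 300-second TTL, and the default-filling behavior mirror the diff above, everything else is hypothetical.

import hashlib
import time
from urllib.parse import urlparse


class SimpleCache:
    """Illustrative stand-in for sentry.utils.cache with TTL semantics."""

    def __init__(self):
        self._store = {}

    def set(self, key, value, timeout):
        self._store[key] = (value, time.monotonic() + timeout)

    def get(self, key):
        entry = self._store.get(key)
        if entry is None:
            return None
        value, expires_at = entry
        if time.monotonic() >= expires_at:
            del self._store[key]
            return None
        return value


cache = SimpleCache()


def get_domain_key(url):
    # One cache key per domain, same shape as the helper in the diff.
    domain = urlparse(url).netloc
    return f"source:blacklist:v2:{hashlib.md5(domain.encode('utf-8')).hexdigest()}"


def lock_domain(url, error=None):
    # Store the error for 5 minutes so subsequent fetches from the same
    # domain short-circuit instead of re-issuing the HTTP request.
    error = dict(error or {})
    error.setdefault("type", "unknown_error")  # EventError.UNKNOWN_ERROR in Sentry
    error.setdefault("url", url)
    cache.set(get_domain_key(url), error, 300)


def fetch_file(url):
    # The fast-path check at the top of fetch_file in the diff.
    cached_error = cache.get(get_domain_key(url))
    if cached_error is not None:
        raise RuntimeError(f"domain locked: {cached_error}")
    ...  # perform the actual HTTP fetch here


lock_domain("http://example.com/app.js", {"type": "fetch_timeout"})
# Any URL on the same domain now fails fast until the lock expires:
# fetch_file("http://example.com/vendor.js") -> RuntimeError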

+ 12 - 0
src/sentry/lang/javascript/processor.py

@@ -416,6 +416,18 @@ def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True):
                 get_max_age(result.headers),
             )
 
+            # Since the cache.set above can fail silently (e.g. when the file
+            # is too large for the cache backend), read the key back to verify
+            # the write. If it is missing, abort the fetch, cache a failure,
+            # and lock the domain for future http fetches.
+            if cache.get(cache_key) is None:
+                error = {
+                    "type": EventError.TOO_LARGE_FOR_CACHE,
+                    "url": http.expose_url(url),
+                }
+                http.lock_domain(url, error=error)
+                raise http.CannotFetch(error)
+
     # If we did not get a 200 OK we just raise a cannot fetch here.
     if result.status != 200:
         raise http.CannotFetch(
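
The read-back in the hunk above exists because common cache backends drop oversized values without raising; memcached's default 1 MB item limit is the classic example, so reading the key back is the only reliable signal that the set succeeded. A minimal sketch of that pattern, with an illustrative SizeLimitedCache standing in for the real backend and hypothetical key/URL values:

MAX_ITEM_SIZE = 1024 * 1024  # assumed backend limit (memcached's default)


class SizeLimitedCache:
    """Illustrative cache that silently refuses oversized values."""

    def __init__(self, max_size):
        self._store = {}
        self._max_size = max_size

    def set(self, key, value, timeout=None):
        # Dropped without any error when too big, like memcached.
        if len(value) <= self._max_size:
            self._store[key] = value

    def get(self, key):
        return self._store.get(key)


cache = SizeLimitedCache(MAX_ITEM_SIZE)

body = b"x" * (2 * MAX_ITEM_SIZE)  # pretend this is the fetched source file
cache.set("source:fetch:abc123", body, 60)

# The set raised nothing, so only a read-back reveals that the value
# was never stored.
if cache.get("source:fetch:abc123") is None:
    error = {"type": "too_large_for_cache", "url": "http://example.com/app.js"}
    # In the diff this becomes http.lock_domain(url, error=error)
    # followed by raise http.CannotFetch(error).
    print("aborting fetch:", error)

Locking the domain here means later events from the same project do not repeatedly re-download a file that can never fit in the cache.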

+ 3 - 1
src/sentry/models/eventerror.py

@@ -20,6 +20,7 @@ class EventError:
     FETCH_INVALID_ENCODING = "fetch_invalid_source_encoding"
     FETCH_TOO_LARGE = "fetch_too_large"
     FETCH_TIMEOUT = "fetch_timeout"
+    TOO_LARGE_FOR_CACHE = "too_large_for_cache"
 
     # Processing: JavaScript
     JS_GENERIC_FETCH_ERROR = "js_generic_fetch_error"  # deprecated in favor of FETCH_GENERIC_ERROR
@@ -65,8 +66,9 @@ class EventError:
         FETCH_GENERIC_ERROR: "Unable to fetch HTTP resource",
         FETCH_INVALID_HTTP_CODE: "HTTP returned error response",
         FETCH_INVALID_ENCODING: "Source file was not encoded properly",
-        FETCH_TOO_LARGE: "Remote file too large",
+        FETCH_TOO_LARGE: "Remote file too large for downloading",
         FETCH_TIMEOUT: "Remote file took too long to load",
+        TOO_LARGE_FOR_CACHE: "Remote file too large for caching",
         JS_GENERIC_FETCH_ERROR: "Unable to fetch resource",
         JS_INVALID_HTTP_CODE: "HTTP returned error response",
         JS_INVALID_CONTENT: "Source file was not JavaScript",

+ 29 - 0
tests/sentry/lang/javascript/test_processor.py

@@ -515,6 +515,35 @@ class FetchFileTest(TestCase):
 
         assert result == result2
 
+    @responses.activate
+    def test_too_large_for_cache(self):
+        # make the file cache fail: drop every key except the domain-lock one
+        domain_key = http.get_domain_key("http://example.com")
+
+        original_get = cache.get
+
+        def cache_get(key):
+            if key == domain_key:
+                return original_get(key)
+
+        with patch("sentry.utils.cache.cache.get", side_effect=cache_get):
+            responses.add(
+                responses.GET,
+                "http://example.com",
+                body=b"Stuff",
+                content_type="application/json; charset=utf-8",
+            )
+
+            with pytest.raises(http.CannotFetch) as exc:
+                fetch_file("http://example.com")
+
+            assert exc.value.data["type"] == EventError.TOO_LARGE_FOR_CACHE
+
+            assert cache.get(domain_key) == {
+                "type": "too_large_for_cache",
+                "url": "http://example.com",
+            }
+
     @responses.activate
     def test_truncated(self):
         url = truncatechars("http://example.com", 3)