Browse Source

ref: handle more cases where urlsplit/urlparse receive invalid urls (#62395)

also centralized the url handling so it's easier to apply for 3.11

<!-- Describe your PR here. -->
anthony sottile 1 year ago
parent
commit
ccbe5e3f14

+ 1 - 1
src/sentry/api/endpoints/api_tokens.py

@@ -17,7 +17,7 @@ from sentry.api.serializers import serialize
 from sentry.auth.superuser import is_active_superuser
 from sentry.models.apitoken import ApiToken
 from sentry.models.outbox import outbox_context
-from sentry.security import capture_security_activity
+from sentry.security.utils import capture_security_activity
 
 
 class ApiTokenSerializer(serializers.Serializer):

+ 1 - 1
src/sentry/api/endpoints/user_authenticator_details.py

@@ -14,7 +14,7 @@ from sentry.auth.authenticators.u2f import decode_credential_id
 from sentry.auth.superuser import is_active_superuser
 from sentry.models.authenticator import Authenticator
 from sentry.models.user import User
-from sentry.security import capture_security_activity
+from sentry.security.utils import capture_security_activity
 from sentry.utils.auth import MFA_SESSION_KEY
 
 

+ 1 - 1
src/sentry/api/endpoints/user_authenticator_enroll.py

@@ -20,7 +20,7 @@ from sentry.auth.authenticators.base import EnrollmentStatus, NewEnrollmentDisal
 from sentry.auth.authenticators.sms import SMSRateLimitExceeded
 from sentry.models.authenticator import Authenticator
 from sentry.models.user import User
-from sentry.security import capture_security_activity
+from sentry.security.utils import capture_security_activity
 from sentry.services.hybrid_cloud.organization import organization_service
 from sentry.utils.auth import MFA_SESSION_KEY
 

+ 1 - 1
src/sentry/api/endpoints/user_password.py

@@ -8,7 +8,7 @@ from sentry.api.api_publish_status import ApiPublishStatus
 from sentry.api.base import control_silo_endpoint
 from sentry.api.bases.user import UserEndpoint
 from sentry.auth import password_validation
-from sentry.security import capture_security_activity
+from sentry.security.utils import capture_security_activity
 
 
 class UserPasswordSerializer(serializers.Serializer):

+ 2 - 22
src/sentry/eventtypes/security.py

@@ -1,29 +1,9 @@
-from urllib.parse import urlsplit, urlunsplit
-
+from sentry.security import csp
 from sentry.utils.safe import get_path
 from sentry.utils.strings import strip
 
 from .base import BaseEvent
 
-LOCAL = "'self'"
-
-
-def _normalize_uri(value):
-    if value in ("", LOCAL, LOCAL.strip("'")):
-        return LOCAL
-
-    # A lot of these values get reported as literally
-    # just the scheme. So a value like 'data' or 'blob', which
-    # are valid schemes, just not a uri. So we want to
-    # normalize it into a uri.
-    if ":" not in value:
-        scheme, hostname = value, ""
-    else:
-        scheme, hostname = urlsplit(value)[:2]
-        if scheme in ("http", "https"):
-            return hostname
-    return urlunsplit((scheme, hostname, "", None, None))
-
 
 class SecurityEvent(BaseEvent):
     def extract_metadata(self, data):
@@ -52,7 +32,7 @@ class CspEvent(SecurityEvent):
 
     def extract_metadata(self, data):
         metadata = SecurityEvent.extract_metadata(self, data)
-        metadata["uri"] = _normalize_uri(data["csp"].get("blocked_uri") or "")
+        metadata["uri"] = csp.normalize_value(data["csp"].get("blocked_uri") or "")
         metadata["directive"] = data["csp"].get("effective_directive")
         return metadata
 

+ 6 - 32
src/sentry/interfaces/security.py

@@ -1,12 +1,12 @@
-__all__ = ("Csp", "Hpkp", "ExpectCT", "ExpectStaple")
-
-from urllib.parse import urlsplit, urlunsplit
-
 from sentry.interfaces.base import Interface
+from sentry.security import csp
 from sentry.utils import json
 from sentry.utils.cache import memoize
 from sentry.web.helpers import render_to_string
 
+__all__ = ("Csp", "Hpkp", "ExpectCT", "ExpectStaple")
+
+
 # Default block list sourced from personal experience as well as
 # reputable blogs from Twitter and Dropbox
 DEFAULT_DISALLOWED_SOURCES = (
@@ -158,7 +158,6 @@ class Csp(SecurityReport):
     >>> }
     """
 
-    LOCAL = "'self'"
     score = 1300
     display_score = 1300
 
@@ -182,7 +181,7 @@ class Csp(SecurityReport):
 
     @memoize
     def normalized_blocked_uri(self):
-        return self._normalize_uri(self.blocked_uri)
+        return csp.normalize_value(self.blocked_uri)
 
     @memoize
     def local_script_violation_type(self):
@@ -192,35 +191,10 @@ class Csp(SecurityReport):
         if (
             self.violated_directive
             and self.effective_directive == "script-src"
-            and self.normalized_blocked_uri == self.LOCAL
+            and self.normalized_blocked_uri == csp.LOCAL
         ):
             if "'unsafe-inline'" in self.violated_directive:
                 return "unsafe-inline"
             elif "'unsafe-eval'" in self.violated_directive:
                 return "unsafe-eval"
         return None
-
-    def _normalize_uri(self, value):
-        if value in ("", self.LOCAL, self.LOCAL.strip("'")):
-            return self.LOCAL
-
-        # A lot of these values get reported as literally
-        # just the scheme. So a value like 'data' or 'blob', which
-        # are valid schemes, just not a uri. So we want to
-        # normalize it into a uri.
-        if ":" not in value:
-            scheme, hostname = value, ""
-        else:
-            try:
-                scheme, hostname = urlsplit(value)[:2]
-            except ValueError:
-                # best effort url splitting
-                scheme, _, rest = value.partition("://")
-                hostname, _, _ = rest.partition("/")
-
-            if scheme in ("http", "https"):
-                return hostname
-        return self._unsplit(scheme, hostname)
-
-    def _unsplit(self, scheme, hostname):
-        return urlunsplit((scheme, hostname, "", None, None))

+ 0 - 1
src/sentry/security/__init__.py

@@ -1 +0,0 @@
-from .utils import *  # NOQA

+ 24 - 0
src/sentry/security/csp.py

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from urllib.parse import urlunsplit
+
+from sentry.utils.urls import urlsplit_best_effort
+
+LOCAL = "'self'"
+
+
+def normalize_value(value: str) -> str:
+    if value in ("", LOCAL, LOCAL.strip("'")):
+        return LOCAL
+
+    # A lot of these values get reported as literally
+    # just the scheme. So a value like 'data' or 'blob', which
+    # are valid schemes, just not a uri. So we want to
+    # normalize it into a uri.
+    if ":" not in value:
+        scheme, hostname = value, ""
+    else:
+        scheme, hostname, _, _ = urlsplit_best_effort(value)
+        if scheme in ("http", "https"):
+            return hostname
+    return urlunsplit((scheme, hostname, "", None, None))

+ 3 - 11
src/sentry/spans/grouping/strategy/base.py

@@ -1,9 +1,9 @@
 import re
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, Optional, Sequence, TypedDict, Union
-from urllib.parse import urlparse
 
 from sentry.spans.grouping.utils import Hash, parse_fingerprint_var
+from sentry.utils import urls
 
 
 class Span(TypedDict):
@@ -247,16 +247,8 @@ def remove_http_client_query_string_strategy(span: Span) -> Optional[Sequence[st
     if method not in HTTP_METHODS:
         return None
 
-    try:
-        url = urlparse(url_str)
-    except ValueError:
-        # attempt a "best effort" splitting
-        url_str, _, _ = url_str.partition("?")
-        scheme, _, rest = url_str.partition("://")
-        netloc, _, path = rest.partition("/")
-        return [method, scheme, netloc, path]
-    else:
-        return [method, url.scheme, url.netloc, url.path]
+    scheme, netloc, path, _ = urls.urlsplit_best_effort(url_str)
+    return [method, scheme, netloc, path]
 
 
 @span_op(["redis", "db.redis"])

+ 15 - 1
src/sentry/utils/urls.py

@@ -1,6 +1,6 @@
 import re
 from typing import MutableMapping, Sequence, Union
-from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse, urlunparse
+from urllib.parse import parse_qs, parse_qsl, urlencode, urljoin, urlparse, urlsplit, urlunparse
 
 _scheme_re = re.compile(r"^([a-zA-Z0-9-+]+://)(.*)$")
 
@@ -59,3 +59,17 @@ def parse_link(url: str) -> str:
         new_path.append(item)
 
     return "/".join(new_path) + "/" + str(url_parts[4])
+
+
+def urlsplit_best_effort(s: str) -> tuple[str, str, str, str]:
+    """first attempts urllib parsing, falls back to crude parsing for invalid urls"""
+    try:
+        parsed = urlsplit(s)
+    except ValueError:
+        rest, _, query = s.partition("?")
+        scheme, _, rest = rest.partition("://")
+        netloc, slash, path = rest.partition("/")
+        path = f"{slash}{path}"
+        return scheme, netloc, path, query
+    else:
+        return parsed.scheme, parsed.netloc, parsed.path, parsed.query

Some files were not shown because too many files changed in this diff