Browse Source

feat(suspect-commits): Add more logging to GitHub blame fetching (#59194)

To better debug some issues I'm seeing, this tags all logs with the
group/event/org IDs. It also adds more resolution as to _why_ a commit
might have been thrown out.
Malachi Willey 1 year ago
parent
commit
e24132fc2f

+ 97 - 53
src/sentry/integrations/github/blame.py

@@ -3,12 +3,13 @@ from __future__ import annotations
 import logging
 from dataclasses import asdict
 from datetime import timezone
-from typing import Dict, Optional, Sequence, TypedDict
+from typing import Any, Dict, Mapping, Optional, Sequence, TypedDict
 
 from django.utils.datastructures import OrderedSet
 from isodate import parse_datetime
 
 from sentry.integrations.mixins.commit_context import CommitInfo, FileBlameInfo, SourceLineInfo
+from sentry.utils import json
 
 logger = logging.getLogger("sentry.integrations.github")
 
@@ -49,9 +50,7 @@ FilePathMapping = Dict[str, Dict[str, OrderedSet]]
 GitHubRepositoryResponse = Dict[str, GitHubRefResponse]
 
 
-def generate_file_path_mapping(
-    files: Sequence[SourceLineInfo],
-) -> FilePathMapping:
+def generate_file_path_mapping(files: Sequence[SourceLineInfo]) -> FilePathMapping:
     """
     Generates a nested mapping of repo -> ref -> file paths.
     This map is used to dedupe matching repos, refs, and file paths and only query
@@ -67,7 +66,7 @@ def generate_file_path_mapping(
     return file_path_mapping
 
 
-def create_blame_query(file_path_mapping: FilePathMapping) -> str:
+def create_blame_query(file_path_mapping: FilePathMapping, extra: Mapping[str, Any]) -> str:
     """
     Create a GraphQL query to get blame information for a list of files
     """
@@ -86,11 +85,21 @@ def create_blame_query(file_path_mapping: FilePathMapping) -> str:
         try:
             [repo_owner, repo_name] = full_repo_name.split("/", maxsplit=1)
         except ValueError:
+            logger.exception(
+                "get_blame_for_files.create_blame_query.invalid_repo_name",
+                extra={**extra, "repo_name": full_repo_name},
+            )
             continue
 
         repo_queries += _make_repo_query(repo_name, repo_owner, ref_queries, repo_index)
 
-    return f"""query {{{repo_queries}\n}}"""
+    query = f"""query {{{repo_queries}\n}}"""
+
+    logger.info(
+        "get_blame_for_files.create_blame_query.created_query", extra={**extra, "query": query}
+    )
+
+    return query
 
 
 def extract_commits_from_blame_response(
@@ -105,6 +114,10 @@ def extract_commits_from_blame_response(
     back to the correct file.
     """
     file_blames: list[FileBlameInfo] = []
+    logger.info(
+        "get_blame_for_files.extract_commits_from_blame.missing_repository",
+        extra={**extra, "response": json.dumps(response)},
+    )
     for repo_index, (full_repo_name, ref_mapping) in enumerate(file_path_mapping.items()):
         repo_mapping: Optional[GitHubRepositoryResponse] = response.get("data", {}).get(
             f"repository{repo_index}"
@@ -138,77 +151,108 @@ def extract_commits_from_blame_response(
                         },
                     )
                     continue
-                matching_file, blame_info = _get_matching_file_and_blame(
-                    files=files,
-                    blame_ranges=blame.get("ranges", []),
-                    path=file_path,
-                    repo_name=full_repo_name,
-                    ref=ref_name,
-                )
-                if not blame_info:
-                    logger.error(
-                        "get_blame_for_files.extract_commits_from_blame.missing_line_blame",
-                        extra={
-                            **extra,
-                            "file_lineno": matching_file.lineno,
-                            "file_path": matching_file.path,
-                            "branch_name": matching_file.ref,
-                            "repo_name": full_repo_name,
-                        },
+                matching_files = [
+                    f
+                    for f in files
+                    if f.path == file_path and f.repo.name == full_repo_name and f.ref == ref_name
+                ]
+                for file in matching_files:
+                    log_info = {
+                        **extra,
+                        "file_lineno": file.lineno,
+                        "file_path": file.path,
+                        "branch_name": file.ref,
+                        "repo_name": full_repo_name,
+                    }
+                    blame_info = _get_matching_file_blame(
+                        file=file,
+                        blame_ranges=blame.get("ranges", []),
+                        extra=log_info,
                     )
-                    continue
-                file_blames.append(blame_info)
+                    if not blame_info:
+                        continue
+                    file_blames.append(blame_info)
     return file_blames
 
 
-def _get_matching_file_and_blame(
-    files: Sequence[SourceLineInfo],
+def _get_matching_file_blame(
+    file: SourceLineInfo,
     blame_ranges: Sequence[GitHubFileBlameRange],
-    path: str,
-    repo_name: str,
-    ref: str,
-) -> tuple[SourceLineInfo, Optional[FileBlameInfo]]:
+    extra: dict[str, str | int | None],
+) -> Optional[FileBlameInfo]:
     """
-    Generates a FileBlameInfo object for the given file path, repo name, and ref.
-    Combines matching objects from the initial file list and the blame range
-    returned from the GraphQL response to create the FileBlameInfo.
+    Generates a FileBlameInfo object for the given file. Searches the given blame range
+    and validates that the commit is valid before creating the FileBlameInfo object.
     """
-    matching_file = [
-        f for f in files if f.path == path and f.repo.name == repo_name and f.ref == ref
-    ][0]
     matching_blame_range = next(
-        iter(
-            [
-                r
-                for r in blame_ranges
-                if r["startingLine"] <= matching_file.lineno <= r["endingLine"]
-            ]
-        ),
+        iter([r for r in blame_ranges if r["startingLine"] <= file.lineno <= r["endingLine"]]),
         None,
     )
     if not matching_blame_range:
-        return matching_file, None
+        logger.error(
+            "get_blame_for_files.extract_commits_from_blame.no_matching_blame_range",
+            extra=extra,
+        )
+        return None
 
     commit: Optional[GitHubFileBlameCommit] = matching_blame_range.get("commit", None)
     if not commit:
-        return matching_file, None
-    committed_date = commit.get("committedDate")
-    if not committed_date:
-        return matching_file, None
+        logger.error(
+            "get_blame_for_files.extract_commits_from_blame.no_commit_data",
+            extra=extra,
+        )
+        return None
+
+    committed_date_str = commit.get("committedDate")
+    commit_id = commit.get("oid")
+
+    if not commit_id:
+        logger.error(
+            "get_blame_for_files.extract_commits_from_blame.invalid_commit_response",
+            extra={
+                **extra,
+                "reason": "Missing property oid",
+            },
+        )
+        return None
+    if not committed_date_str:
+        logger.error(
+            "get_blame_for_files.extract_commits_from_blame.invalid_commit_response",
+            extra={
+                **extra,
+                "commit_id": commit_id,
+                "reason": "Missing property committedDate",
+            },
+        )
+        return None
+
+    try:
+        committed_date = parse_datetime(committed_date_str).astimezone(timezone.utc)
+    except Exception:
+        logger.exception(
+            "get_blame_for_files.extract_commits_from_blame.invalid_commit_response",
+            extra={
+                **extra,
+                "commit_id": commit_id,
+                "committed_date": committed_date_str,
+                "reason": "Failed to convert committed date to datetime.",
+            },
+        )
+        return None
 
     author = commit.get("author")
     blame = FileBlameInfo(
-        **asdict(matching_file),
+        **asdict(file),
         commit=CommitInfo(
-            commitId=commit.get("oid"),
+            commitId=commit_id,
             commitAuthorName=author.get("name") if author else None,
             commitAuthorEmail=author.get("email") if author else None,
             commitMessage=commit.get("message"),
-            committedDate=parse_datetime(committed_date).astimezone(timezone.utc),
+            committedDate=committed_date,
         ),
     )
 
-    return matching_file, blame
+    return blame
 
 
 def _make_ref_query(ref: str, blame_queries: str, index: int) -> str:

+ 16 - 15
src/sentry/integrations/github/client.py

@@ -690,7 +690,14 @@ class GitHubClientMixin(GithubProxyClient):
 
         return response_ref.get("target", {}).get("blame", {}).get("ranges", [])
 
-    def get_blame_for_files(self, files: Sequence[SourceLineInfo]) -> Sequence[FileBlameInfo]:
+    def get_blame_for_files(
+        self, files: Sequence[SourceLineInfo], extra: Mapping[str, Any]
+    ) -> Sequence[FileBlameInfo]:
+        log_info = {
+            **extra,
+            "provider": "github",
+            "organization_integration_id": self.org_integration_id,
+        }
         rate_limit = self.get_rate_limit(specific_resource="graphql")
         if rate_limit.remaining < MINIMUM_REQUESTS:
             metrics.incr("sentry.integrations.github.get_blame_for_files.rate_limit")
@@ -711,17 +718,11 @@ class GitHubClientMixin(GithubProxyClient):
         try:
             response = self.post(
                 path="/graphql",
-                data={"query": create_blame_query(file_path_mapping)},
+                data={"query": create_blame_query(file_path_mapping, extra=log_info)},
                 allow_text=False,
             )
         except ValueError as e:
-            logger.exception(
-                e,
-                {
-                    "provider": "github",
-                    "organization_integration_id": self.org_integration_id,
-                },
-            )
+            logger.exception(e, log_info)
             return []
 
         if not isinstance(response, MappingApiResponse):
@@ -733,18 +734,18 @@ class GitHubClientMixin(GithubProxyClient):
             )
             logger.error(
                 "get_blame_for_files.graphql_error",
-                extra={
-                    "provider": "github",
-                    "error": err_message,
-                    "organization_integration_id": self.org_integration_id,
-                },
+                extra={**log_info, "error_message": err_message},
             )
 
         return extract_commits_from_blame_response(
             response=response,
             file_path_mapping=file_path_mapping,
             files=files,
-            extra={"provider": "github", "organization_integration_id": self.org_integration_id},
+            extra={
+                **extra,
+                "provider": "github",
+                "organization_integration_id": self.org_integration_id,
+            },
         )
 
 

+ 2 - 2
src/sentry/integrations/github/integration.py

@@ -247,9 +247,9 @@ class GitHubIntegration(IntegrationInstallation, GitHubIssueBasic, RepositoryMix
         return True
 
     def get_commit_context_all_frames(
-        self, files: Sequence[SourceLineInfo]
+        self, files: Sequence[SourceLineInfo], extra: Mapping[str, Any]
     ) -> Sequence[FileBlameInfo]:
-        return self.get_blame_for_files(files)
+        return self.get_blame_for_files(files, extra)
 
     def get_commit_context(
         self, repo: Repository, filepath: str, ref: str, event_frame: Mapping[str, Any]

+ 10 - 4
src/sentry/integrations/mixins/commit_context.py

@@ -40,7 +40,9 @@ class GetBlameForFile(Protocol):
     ) -> list[dict[str, Any]] | None:
         ...
 
-    def get_blame_for_files(self, files: Sequence[SourceLineInfo]) -> list[FileBlameInfo]:
+    def get_blame_for_files(
+        self, files: Sequence[SourceLineInfo], extra: Mapping[str, Any]
+    ) -> list[FileBlameInfo]:
         ...
 
 
@@ -78,7 +80,9 @@ class CommitContextMixin(GetClient):
 
         return response
 
-    def get_blame_for_files(self, files: Sequence[SourceLineInfo]) -> list[FileBlameInfo]:
+    def get_blame_for_files(
+        self, files: Sequence[SourceLineInfo], extra: Mapping[str, Any]
+    ) -> list[FileBlameInfo]:
         """
         Calls the client's `get_blame_for_files` method to fetch blame for a list of files.
 
@@ -89,7 +93,7 @@ class CommitContextMixin(GetClient):
         except Identity.DoesNotExist:
             return []
         try:
-            response = client.get_blame_for_files(files)
+            response = client.get_blame_for_files(files, extra)
         except IdentityNotValid:
             return []
         except ApiError as e:
@@ -97,7 +101,9 @@ class CommitContextMixin(GetClient):
 
         return response
 
-    def get_commit_context_all_frames(self, files: Sequence[SourceLineInfo]) -> list[FileBlameInfo]:
+    def get_commit_context_all_frames(
+        self, files: Sequence[SourceLineInfo], extra: Mapping[str, Any]
+    ) -> list[FileBlameInfo]:
         """
         Given a list of source files and line numbers,returns the commit info for the most recent commit.
         """

+ 1 - 1
src/sentry/integrations/utils/commit_context.py

@@ -382,7 +382,7 @@ def _get_blames_from_all_integrations(
             integration.provider,
         )
         try:
-            blames = install.get_commit_context_all_frames(files)
+            blames = install.get_commit_context_all_frames(files, extra=extra)
             file_blames.extend(blames)
         except Exception as e:
             log_info = {

+ 134 - 11
tests/sentry/integrations/github/test_client.py

@@ -952,7 +952,7 @@ class GitHubClientFileBlameQueryBuilderTest(GitHubClientFileBlameBase):
             content_type="application/json",
         )
 
-        self.github_client.get_blame_for_files([file1, file2, file3])
+        self.github_client.get_blame_for_files([file1, file2, file3], extra={})
         assert json.loads(responses.calls[2].request.body)["query"] == query
 
     @mock.patch("sentry.integrations.github.client.get_jwt", return_value=b"jwt_token_1")
@@ -1059,7 +1059,7 @@ class GitHubClientFileBlameQueryBuilderTest(GitHubClientFileBlameBase):
             content_type="application/json",
         )
 
-        self.github_client.get_blame_for_files([file1, file2, file3])
+        self.github_client.get_blame_for_files([file1, file2, file3], extra={})
         assert json.loads(responses.calls[2].request.body)["query"] == query
 
     @mock.patch("sentry.integrations.github.client.get_jwt", return_value=b"jwt_token_1")
@@ -1148,7 +1148,7 @@ class GitHubClientFileBlameQueryBuilderTest(GitHubClientFileBlameBase):
             content_type="application/json",
         )
 
-        self.github_client.get_blame_for_files([file1, file2, file3])
+        self.github_client.get_blame_for_files([file1, file2, file3], extra={})
         assert json.loads(responses.calls[2].request.body)["query"] == query
 
 
@@ -1164,7 +1164,7 @@ class GitHubClientFileBlameResponseTest(GitHubClientFileBlameBase):
     @responses.activate
     def test_get_blame_for_files_full_response(self, get_jwt):
         """
-        Tests that the correct commits are returned when a full response is returned
+        Tests that the correct commits are selected from the blame range when a full response is returned.
         """
         file1 = SourceLineInfo(
             path="src/sentry/integrations/github/client_1.py",
@@ -1175,7 +1175,7 @@ class GitHubClientFileBlameResponseTest(GitHubClientFileBlameBase):
         )
         file2 = SourceLineInfo(
             path="src/sentry/integrations/github/client_1.py",
-            lineno=15,
+            lineno=20,
             ref="master",
             repo=self.repo_1,
             code_mapping=None,  # type:ignore
@@ -1218,14 +1218,25 @@ class GitHubClientFileBlameResponseTest(GitHubClientFileBlameBase):
                                     "endingLine": 15,
                                     "age": 0,
                                 },
+                                {
+                                    "commit": {
+                                        "oid": "456",
+                                        "author": {"name": "foo2", "email": "foo2@example.com"},
+                                        "message": "hello",
+                                        "committedDate": "2021-01-01T00:00:00Z",
+                                    },
+                                    "startingLine": 16,
+                                    "endingLine": 21,
+                                    "age": 0,
+                                },
                             ]
                         },
                         "blame1": {
                             "ranges": [
                                 {
                                     "commit": {
-                                        "oid": "456",
-                                        "author": {"name": "foo2", "email": "foo2@example.com"},
+                                        "oid": "789",
+                                        "author": {"name": "foo3", "email": "foo3@example.com"},
                                         "message": "hello",
                                         "committedDate": "2020-01-01T00:00:00Z",
                                     },
@@ -1249,7 +1260,7 @@ class GitHubClientFileBlameResponseTest(GitHubClientFileBlameBase):
             content_type="application/json",
         )
 
-        response = self.github_client.get_blame_for_files([file1, file2, file3])
+        response = self.github_client.get_blame_for_files([file1, file2, file3], extra={})
         self.assertEqual(
             response,
             [
@@ -1264,12 +1275,22 @@ class GitHubClientFileBlameResponseTest(GitHubClientFileBlameBase):
                     ),
                 ),
                 FileBlameInfo(
-                    **asdict(file3),
+                    **asdict(file2),
                     commit=CommitInfo(
                         commitId="456",
                         commitAuthorName="foo2",
                         commitAuthorEmail="foo2@example.com",
                         commitMessage="hello",
+                        committedDate=datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
+                    ),
+                ),
+                FileBlameInfo(
+                    **asdict(file3),
+                    commit=CommitInfo(
+                        commitId="789",
+                        commitAuthorName="foo3",
+                        commitAuthorEmail="foo3@example.com",
+                        commitMessage="hello",
                         committedDate=datetime(2020, 1, 1, 0, 0, 0, tzinfo=timezone.utc),
                     ),
                 ),
@@ -1346,7 +1367,7 @@ class GitHubClientFileBlameResponseTest(GitHubClientFileBlameBase):
             content_type="application/json",
         )
 
-        response = self.github_client.get_blame_for_files([file1, file2, file3, file4])
+        response = self.github_client.get_blame_for_files([file1, file2, file3, file4], extra={})
         self.assertEqual(
             response,
             [
@@ -1363,6 +1384,108 @@ class GitHubClientFileBlameResponseTest(GitHubClientFileBlameBase):
             ],
         )
 
+    @mock.patch("sentry.integrations.github.client.logger.error")
+    @mock.patch("sentry.integrations.github.client.get_jwt", return_value=b"jwt_token_1")
+    @responses.activate
+    def test_get_blame_for_files_invalid_commit(self, get_jwt, mock_logger_error):
+        """
+        Tests commits that have invalid data are skipped and logged
+        """
+        file1 = SourceLineInfo(
+            path="src/sentry/integrations/github/client_1.py",
+            lineno=10,
+            ref="master",
+            repo=self.repo_1,
+            code_mapping=None,  # type: ignore
+        )
+        file2 = SourceLineInfo(
+            path="src/sentry/integrations/github/client_2.py",
+            lineno=15,
+            ref="master",
+            repo=self.repo_1,
+            code_mapping=None,  # type:ignore
+        )
+        data = {
+            "repository0": {
+                "ref0": {
+                    "target": {
+                        "blame0": {
+                            "ranges": [
+                                {
+                                    "commit": {
+                                        "oid": None,
+                                        "author": None,
+                                        "message": None,
+                                        "committedDate": "2022-01-01T00:00:00Z",
+                                    },
+                                    "startingLine": 10,
+                                    "endingLine": 15,
+                                    "age": 0,
+                                }
+                            ]
+                        },
+                        "blame1": {
+                            "ranges": [
+                                {
+                                    "commit": {
+                                        "oid": "123",
+                                        "author": None,
+                                        "message": None,
+                                        "committedDate": None,
+                                    },
+                                    "startingLine": 10,
+                                    "endingLine": 15,
+                                    "age": 0,
+                                }
+                            ]
+                        },
+                    }
+                }
+            },
+        }
+
+        responses.add(
+            method=responses.POST,
+            url="https://api.github.com/graphql",
+            json={
+                "data": data,
+            },
+            content_type="application/json",
+        )
+
+        response = self.github_client.get_blame_for_files([file1, file2], extra={})
+        self.assertEqual(response, [])
+
+        mock_logger_error.assert_has_calls(
+            [
+                mock.call(
+                    "get_blame_for_files.extract_commits_from_blame.invalid_commit_response",
+                    extra={
+                        "provider": "github",
+                        "organization_integration_id": self.github_client.org_integration_id,
+                        "file_lineno": file1.lineno,
+                        "file_path": file1.path,
+                        "branch_name": file1.ref,
+                        "repo_name": file1.repo.name,
+                        "reason": "Missing property oid",
+                    },
+                ),
+                mock.call(
+                    "get_blame_for_files.extract_commits_from_blame.invalid_commit_response",
+                    extra={
+                        "provider": "github",
+                        "organization_integration_id": self.github_client.org_integration_id,
+                        "file_lineno": file2.lineno,
+                        "file_path": file2.path,
+                        "branch_name": file2.ref,
+                        "repo_name": file2.repo.name,
+                        "commit_id": "123",
+                        "reason": "Missing property committedDate",
+                    },
+                ),
+            ]
+        )
+
 
 class GitHubClientFileBlameRateLimitTest(GitHubClientFileBlameBase):
     """
@@ -1410,7 +1533,7 @@ class GitHubClientFileBlameRateLimitTest(GitHubClientFileBlameBase):
     @responses.activate
     def test_rate_limit_exceeded(self, get_jwt, mock_logger_error):
         with pytest.raises(GitHubApproachingRateLimit):
-            self.github_client.get_blame_for_files([self.file])
+            self.github_client.get_blame_for_files([self.file], extra={})
         mock_logger_error.assert_called_with(
             "sentry.integrations.github.get_blame_for_files.rate_limit",
             extra={

+ 12 - 2
tests/sentry/tasks/test_commit_context.py

@@ -784,7 +784,12 @@ class TestCommitContextAllFrames(TestCommitContextMixin):
                     repo=other_code_mapping.repository,
                     code_mapping=other_code_mapping,
                 )
-            ]
+            ],
+            extra={
+                "event": self.event.event_id,
+                "group": self.event.group_id,
+                "organization": self.event.project.organization_id,
+            },
         )
 
     @patch("sentry.tasks.groupowner.process_suspect_commits.delay")
@@ -1203,7 +1208,12 @@ class TestCommitContextAllFrames(TestCommitContextMixin):
                     repo=self.repo,
                     code_mapping=self.code_mapping,
                 ),
-            ]
+            ],
+            extra={
+                "event": self.event.event_id,
+                "group": self.event.group_id,
+                "organization": self.organization.id,
+            },
         )
         mock_record.assert_any_call(
             "integrations.successfully_fetched_commit_context_all_frames",