
feat(derived_code_mappings): Prevent rate limit API exhaustion (#44110)

One of our customers is intermittently exhausting their GitHub API rate limit, which affects the commit_context task.

Changes included:

* get_repositories will not make API requests if the remaining requests are 200 or fewer. We will resume fetching repositories in the next hour, once the limit is reset (see the sketch below).
* Spread the caching of repositories across 24 hours. This makes the task use only 1/24th of the required API requests.
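
A minimal sketch of the guard, assuming a client whose `get_rate_limit()` returns an object with a `remaining` count (as `GithubRateLimitInfo` does in the diff below); the function name and the `fetch_tree` call are hypothetical stand-ins, not the exact implementation:

```python
MINIMUM_REQUESTS = 200  # requests kept in reserve for other GitHub features


def populate_trees_sketch(client, repositories):
    # Hypothetical sketch: once the remaining budget drops to the floor,
    # stop hitting the API and serve cached trees until the hourly reset.
    trees = {}
    only_use_cache = False
    remaining = client.get_rate_limit().remaining
    for repo_info in repositories:
        if not only_use_cache and remaining <= MINIMUM_REQUESTS:
            only_use_cache = True
        trees[repo_info["full_name"]] = client.fetch_tree(repo_info, only_use_cache)
        remaining -= 1
    return trees
```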

Fixes [SENTRY-Y3M](https://sentry.sentry.io/issues/3900033302/)
Armen Zambrano G · 2 years ago · commit 1c4b690d71

+ 52 - 7
src/sentry/integrations/github/client.py

@@ -18,6 +18,11 @@ from sentry.utils.json import JSONData
 
 logger = logging.getLogger("sentry.integrations.github")
 
+# Functions that require a large number of API requests can use this value as
+# the floor below which they stop hitting GitHub, leaving at least this many
+# requests available for other features that need to reach GitHub
+MINIMUM_REQUESTS = 200
+
 
 class GithubRateLimitInfo:
     def __init__(self, info: Dict[str, int]) -> None:
@@ -132,6 +137,7 @@ class GitHubClientMixin(ApiClient):  # type: ignore
         repo_full_name: str,
         tree_sha: str,
         only_source_code_files: bool = True,
+        only_use_cache: bool = False,
         cache_seconds: int = 3600 * 24,
     ) -> List[str]:
         """It return all files for a repo or just source code files.
@@ -139,10 +145,13 @@ class GitHubClientMixin(ApiClient):  # type: ignore
         repo_full_name: e.g. getsentry/sentry
         tree_sha: A branch or a commit sha
         only_source_code_files: Include all files or just the source code files
+        only_use_cache: Do not hit the network; use the cached value, if any.
+            This is useful when the remaining API requests are low
+        cache_seconds: How long to cache a value for
         """
         key = f"github:repo:{repo_full_name}:{'source-code' if only_source_code_files else 'all'}"
         repo_files: List[str] = cache.get(key, [])
-        if not repo_files:
+        if not repo_files and not only_use_cache:
             tree = self.get_tree(repo_full_name, tree_sha)
             if tree:
                 # Keep files; discard directories
@@ -169,10 +178,12 @@ class GitHubClientMixin(ApiClient):  # type: ignore
         extra = {"gh_org": gh_org}
         try:
             repositories = self._populate_repositories(gh_org, cache_seconds)
-            # XXX: In order to speed up this function we will need to parallelize this
-            # Use ThreadPoolExecutor; see src/sentry/utils/snuba.py#L358
-            for repo_info in repositories:
-                trees[repo_info["full_name"]] = self._populate_tree(repo_info)
+            trees = self._populate_trees(repositories)
+
+            rate_limit = self.get_rate_limit()
+            extra.update(
+                {"remaining": str(rate_limit.remaining), "repos_num": str(len(repositories))}
+            )
             logger.info("Using cached trees for Github org.", extra=extra)
         except ApiError as error:
             msg = error.text
@@ -205,10 +216,44 @@ class GitHubClientMixin(ApiClient):  # type: ignore
 
         return repositories
 
-    def _populate_tree(self, repo_info: Dict[str, str]) -> RepoTree:
+    def _populate_trees(self, repositories: List[Dict[str, str]]) -> Dict[str, RepoTree]:
+        """
+        For every repository, fetch the associated tree and cache it.
+        This function takes API rate limits into consideration to prevent exhaustion.
+        """
+        trees: Dict[str, RepoTree] = {}
+        only_use_cache = False
+
+        rate_limit = self.get_rate_limit()
+        remaining_requests = rate_limit.remaining
+        logger.info("Current rate limit info.", extra={"rate_limit": rate_limit})
+
+        for index, repo_info in enumerate(repositories):
+            # Only use the cache once we drop to the minimum requests floor
+            # We will fetch again after the limit is reset (every hour)
+            if not only_use_cache and remaining_requests <= MINIMUM_REQUESTS:
+                only_use_cache = True
+                logger.info(
+                    "Too few requests remaining. Grabbing values from the cache.",
+                    extra={"repo_info": repo_info},
+                )
+            # The Github API rate limit is reset every hour
+            # Spread the expiration of the cache of each repo across the day
+            trees[repo_info["full_name"]] = self._populate_tree(
+                repo_info, only_use_cache, (3600 * 24) + (3600 * (index % 24))
+            )
+            remaining_requests -= 1
+
+        return trees
+
+    def _populate_tree(
+        self, repo_info: Dict[str, str], only_use_cache: bool, cache_seconds: int
+    ) -> RepoTree:
         full_name = repo_info["full_name"]
         branch = repo_info["default_branch"]
-        repo_files = self.get_cached_repo_files(full_name, branch)
+        repo_files = self.get_cached_repo_files(
+            full_name, branch, only_use_cache=only_use_cache, cache_seconds=cache_seconds
+        )
         return RepoTree(Repo(full_name, branch), repo_files)
 
     def get_repositories(self, fetch_max_pages: bool = False) -> Sequence[JSONData]:
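
The `(3600 * 24) + (3600 * (index % 24))` expression above staggers per-repo cache TTLs. A small illustration of the values it produces, assuming repositories are enumerated in a stable order as in `_populate_trees`:

```python
def cache_seconds_for(index: int) -> int:
    # Base TTL of 24 hours plus a 0-23 hour offset based on the repo's
    # position, so expirations spread across a 24-hour window and roughly
    # 1/24th of the cached trees need refreshing in any given hour.
    return (3600 * 24) + (3600 * (index % 24))


assert cache_seconds_for(0) == 24 * 3600    # first repo expires after 24h
assert cache_seconds_for(23) == 47 * 3600   # 24th repo expires after 47h
assert cache_seconds_for(24) == 24 * 3600   # offsets wrap every 24 repos
```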

+ 3 - 1
src/sentry/integrations/github/integration.py

@@ -118,7 +118,9 @@ class GitHubIntegration(IntegrationInstallation, GitHubIssueBasic, RepositoryMix
             id=self.org_integration.organization_id, user_id=None
         )
         if not organization_context:
-            logger.exception("No organization information was found.", extra=extra)
+            logger.exception(
+                "No organization information was found. Continuing execution.", extra=extra
+            )
         else:
             trees = self.get_client().get_trees_for_org(gh_org=gh_org, cache_seconds=cache_seconds)
 

+ 1 - 1
tests/sentry/integrations/github/test_client.py

@@ -76,7 +76,7 @@ class GitHubAppsClientTest(TestCase):
             assert gh_rate_limit.used == 7
             assert gh_rate_limit.next_window() == "17:39:49"
 
-    def test_get_rate_limit_specific_resouces(self):
+    def test_get_rate_limit_non_existent_resource(self):
         with pytest.raises(AssertionError):
             self.client.get_rate_limit("foo")
 

+ 52 - 33
tests/sentry/integrations/github/test_integration.py

@@ -6,7 +6,7 @@ from django.urls import reverse
 
 import sentry
 from sentry.constants import ObjectStatus
-from sentry.integrations.github import API_ERRORS, GitHubIntegrationProvider
+from sentry.integrations.github import API_ERRORS, MINIMUM_REQUESTS, GitHubIntegrationProvider
 from sentry.integrations.utils.code_mapping import Repo, RepoTree
 from sentry.models import Integration, OrganizationIntegration, Project, Repository
 from sentry.plugins.base import plugins
@@ -585,6 +585,18 @@ class GitHubIntegrationTest(IntegrationTestCase):
             organization=self.organization, integration=integration
         ).exists()
 
+    def set_rate_limit(self, remaining, limit=5000):
+        """Helper class to set the rate limit"""
+        responses.add(
+            method=responses.GET,
+            url="https://api.github.com/rate_limit",
+            json={
+                "resources": {
+                    "core": {"limit": limit, "remaining": remaining, "used": "foo", "reset": 123}
+                }
+            },
+        )
+
     def get_installation_helper(self):
         with self.tasks():
             self.assert_setup_flow()  # This somehow creates the integration
@@ -593,36 +605,11 @@ class GitHubIntegrationTest(IntegrationTestCase):
         installation = integration.get_installation(self.organization.id)
         return installation
 
-    @responses.activate
-    def test_get_trees_for_org_handles_rate_limit_reached(self):
-        """Test that we will not hit Github's API more than once when we reach the API rate limit"""
-        responses.replace(
-            responses.GET,
-            f"{self.base_url}/repos/Test-Organization/foo/git/trees/master?recursive=1",
-            json={"message": "API rate limit exceeded for installation ID 123456."},
-            status=403,
-        )
-
-        installation = self.get_installation_helper()
-        trees = installation.get_trees_for_org()
-        key_prefix = "github:repo:Test-Organization"
-        # We have access to the files because the rate limit has not been hit
-        assert cache.get(f"{key_prefix}/xyz:source-code") == ["src/foo.py"]
-        # These repos are None because foo hit the rate limit
-        for repo in ("foo", "bar", "baz"):
-            assert cache.get(f"{key_prefix}/{repo}:source-code") is None
-        assert len(trees.keys()) == 1
-
-        assert trees == {
-            "Test-Organization/xyz": RepoTree(
-                Repo("Test-Organization/xyz", "master"), ["src/foo.py"]
-            )
-        }
-
     @responses.activate
     def test_get_trees_for_org_works(self):
         """Fetch the tree representation of a repo"""
         installation = self.get_installation_helper()
+        self.set_rate_limit(MINIMUM_REQUESTS + 50)
         expected_trees = {
             "Test-Organization/bar": RepoTree(Repo("Test-Organization/bar", "main"), []),
             "Test-Organization/baz": RepoTree(Repo("Test-Organization/baz", "master"), []),
@@ -635,22 +622,54 @@ class GitHubIntegrationTest(IntegrationTestCase):
             ),
         }
 
-        assert not cache.get("githubtrees:repositories:Test-Organization")
+        repos_key = "githubtrees:repositories:Test-Organization"
+        repo_key = lambda x: f"github:repo:Test-Organization/{x}:source-code"
         # Check that the cache is clear
-        repo_key = "github:repo:Test-Organization/foo:source-code"
-        assert cache.get("githubtrees:repositories:Test-Organization") is None
-        assert cache.get(repo_key) is None
+        assert cache.get(repos_key) is None
+        assert cache.get(repo_key("foo")) is None
         trees = installation.get_trees_for_org()
 
-        assert cache.get("githubtrees:repositories:Test-Organization") == [
+        assert cache.get(repos_key) == [
             {"full_name": "Test-Organization/xyz", "default_branch": "master"},
             {"full_name": "Test-Organization/foo", "default_branch": "master"},
             {"full_name": "Test-Organization/bar", "default_branch": "main"},
             {"full_name": "Test-Organization/baz", "default_branch": "master"},
         ]
-        assert cache.get(repo_key) == ["src/sentry/api/endpoints/auth_login.py"]
+        assert cache.get(repo_key("foo")) == ["src/sentry/api/endpoints/auth_login.py"]
         assert trees == expected_trees
 
         # Calling a second time should produce the same results
         trees = installation.get_trees_for_org()
         assert trees == expected_trees
+
+    @responses.activate
+    def test_get_trees_for_org_prevent_exhaustion_some_repos(self):
+        """Some repos will hit the network but the rest will grab from the cache."""
+        gh_org = "Test-Organization"
+        installation = self.get_installation_helper()
+        expected_trees = {
+            f"{gh_org}/xyz": RepoTree(Repo(f"{gh_org}/xyz", "master"), ["src/foo.py"]),
+            # This will have no files because we will hit the minimum remaining requests floor
+            f"{gh_org}/foo": RepoTree(Repo(f"{gh_org}/foo", "master"), []),
+            f"{gh_org}/bar": RepoTree(Repo(f"{gh_org}/bar", "main"), []),
+            f"{gh_org}/baz": RepoTree(Repo(f"{gh_org}/baz", "master"), []),
+        }
+        with patch("sentry.integrations.github.client.MINIMUM_REQUESTS", new=5, autospec=False):
+            # We start with one request left before reaching the minimum remaining requests floor
+            self.set_rate_limit(remaining=6)
+            trees = installation.get_trees_for_org()
+            assert trees == expected_trees  # xyz will have files but not foo
+
+            # Another call should not make us lose the files for xyz
+            self.set_rate_limit(remaining=5)
+            trees = installation.get_trees_for_org()
+            assert trees == expected_trees  # xyz will have files but not foo
+
+            # We reset the remaining values
+            self.set_rate_limit(remaining=20)
+            trees = installation.get_trees_for_org()
+            # Now that the rate limit is reset we should get files for foo
+            expected_trees[f"{gh_org}/foo"] = RepoTree(
+                Repo(f"{gh_org}/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"]
+            )
+            assert trees == expected_trees