
feat(derived_code_mappings): Prevent rate limit API exhaustion (#44110)

One of our customers is intermittently exhausting their GitHub API rate limit, which affects the commit_context task.

Changes included:

* get_repositories will not make API requests if the remaining requests are 200 or fewer. We will resume fetching repositories in the next hour, once the limit is reset (see the sketch below).
* Spread the caching of repositories across 24 hours. This makes the task use only 1/24th of the required API requests.
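
A minimal sketch of the guard, assuming a client whose `get_rate_limit()` returns an object with a `remaining` count (as `GithubRateLimitInfo` does in the diff below); the function name and the `fetch_tree` call are hypothetical stand-ins, not the exact implementation:

```python
MINIMUM_REQUESTS = 200  # requests kept in reserve for other GitHub features


def populate_trees_sketch(client, repositories):
    # Hypothetical sketch: once the remaining budget drops to the floor,
    # stop hitting the API and serve cached trees until the hourly reset.
    trees = {}
    only_use_cache = False
    remaining = client.get_rate_limit().remaining
    for repo_info in repositories:
        if not only_use_cache and remaining <= MINIMUM_REQUESTS:
            only_use_cache = True
        trees[repo_info["full_name"]] = client.fetch_tree(repo_info, only_use_cache)
        remaining -= 1
    return trees
```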

Fixes [SENTRY-Y3M](https://sentry.sentry.io/issues/3900033302/)
Armen Zambrano G · 2 years ago · commit 1c4b690d71

+ 52 - 7
src/sentry/integrations/github/client.py

@@ -18,6 +18,11 @@ from sentry.utils.json import JSONData
 
 logger = logging.getLogger("sentry.integrations.github")
 
+# Functions that require a large number of API requests can use this value as
+# the floor below which they stop hitting GitHub, leaving at least this many
+# requests available for other features that need to reach GitHub
+MINIMUM_REQUESTS = 200
+
 
 class GithubRateLimitInfo:
     def __init__(self, info: Dict[str, int]) -> None:
@@ -132,6 +137,7 @@ class GitHubClientMixin(ApiClient):  # type: ignore
         repo_full_name: str,
         tree_sha: str,
         only_source_code_files: bool = True,
+        only_use_cache: bool = False,
         cache_seconds: int = 3600 * 24,
     ) -> List[str]:
         """It return all files for a repo or just source code files.
@@ -139,10 +145,13 @@ class GitHubClientMixin(ApiClient):  # type: ignore
         repo_full_name: e.g. getsentry/sentry
         tree_sha: A branch or a commit sha
         only_source_code_files: Include all files or just the source code files
+        only_use_cache: Do not hit the network; use the cached value, if any.
+            This is useful when the remaining API requests are low
+        cache_seconds: How long to cache a value for
         """
         key = f"github:repo:{repo_full_name}:{'source-code' if only_source_code_files else 'all'}"
         repo_files: List[str] = cache.get(key, [])
-        if not repo_files:
+        if not repo_files and not only_use_cache:
             tree = self.get_tree(repo_full_name, tree_sha)
             if tree:
                 # Keep files; discard directories
@@ -169,10 +178,12 @@ class GitHubClientMixin(ApiClient):  # type: ignore
         extra = {"gh_org": gh_org}
         try:
             repositories = self._populate_repositories(gh_org, cache_seconds)
-            # XXX: In order to speed up this function we will need to parallelize this
-            # Use ThreadPoolExecutor; see src/sentry/utils/snuba.py#L358
-            for repo_info in repositories:
-                trees[repo_info["full_name"]] = self._populate_tree(repo_info)
+            trees = self._populate_trees(repositories)
+
+            rate_limit = self.get_rate_limit()
+            extra.update(
+                {"remaining": str(rate_limit.remaining), "repos_num": str(len(repositories))}
+            )
             logger.info("Using cached trees for Github org.", extra=extra)
         except ApiError as error:
             msg = error.text
@@ -205,10 +216,44 @@ class GitHubClientMixin(ApiClient):  # type: ignore
 
         return repositories
 
-    def _populate_tree(self, repo_info: Dict[str, str]) -> RepoTree:
+    def _populate_trees(self, repositories: List[Dict[str, str]]) -> Dict[str, RepoTree]:
+        """
+        For every repository, fetch the associated tree and cache it.
+        This function takes API rate limits into consideration to prevent exhaustion.
+        """
+        trees: Dict[str, RepoTree] = {}
+        only_use_cache = False
+
+        rate_limit = self.get_rate_limit()
+        remaining_requests = rate_limit.remaining
+        logger.info("Current rate limit info.", extra={"rate_limit": rate_limit})
+
+        for index, repo_info in enumerate(repositories):
+            # Only use the cache once we drop to the minimum requests floor
+            # We will fetch again after the limit is reset (every hour)
+            if not only_use_cache and remaining_requests <= MINIMUM_REQUESTS:
+                only_use_cache = True
+                logger.info(
+                    "Too few requests remaining. Grabbing values from the cache.",
+                    extra={"repo_info": repo_info},
+                )
+            # The Github API rate limit is reset every hour
+            # Spread the expiration of the cache of each repo across the day
+            trees[repo_info["full_name"]] = self._populate_tree(
+                repo_info, only_use_cache, (3600 * 24) + (3600 * (index % 24))
+            )
+            remaining_requests -= 1
+
+        return trees
+
+    def _populate_tree(
+        self, repo_info: Dict[str, str], only_use_cache: bool, cache_seconds: int
+    ) -> RepoTree:
         full_name = repo_info["full_name"]
         branch = repo_info["default_branch"]
-        repo_files = self.get_cached_repo_files(full_name, branch)
+        repo_files = self.get_cached_repo_files(
+            full_name, branch, only_use_cache=only_use_cache, cache_seconds=cache_seconds
+        )
         return RepoTree(Repo(full_name, branch), repo_files)
 
     def get_repositories(self, fetch_max_pages: bool = False) -> Sequence[JSONData]:
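
The `(3600 * 24) + (3600 * (index % 24))` expression above staggers per-repo cache TTLs. A small illustration of the values it produces, assuming repositories are enumerated in a stable order as in `_populate_trees`:

```python
def cache_seconds_for(index: int) -> int:
    # Base TTL of 24 hours plus a 0-23 hour offset based on the repo's
    # position, so expirations spread across a 24-hour window and roughly
    # 1/24th of the cached trees need refreshing in any given hour.
    return (3600 * 24) + (3600 * (index % 24))


assert cache_seconds_for(0) == 24 * 3600    # first repo expires after 24h
assert cache_seconds_for(23) == 47 * 3600   # 24th repo expires after 47h
assert cache_seconds_for(24) == 24 * 3600   # offsets wrap every 24 repos
```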

+ 3 - 1
src/sentry/integrations/github/integration.py

@@ -118,7 +118,9 @@ class GitHubIntegration(IntegrationInstallation, GitHubIssueBasic, RepositoryMix
             id=self.org_integration.organization_id, user_id=None
         )
         if not organization_context:
-            logger.exception("No organization information was found.", extra=extra)
+            logger.exception(
+                "No organization information was found. Continuing execution.", extra=extra
+            )
         else:
             trees = self.get_client().get_trees_for_org(gh_org=gh_org, cache_seconds=cache_seconds)
 

+ 1 - 1
tests/sentry/integrations/github/test_client.py

@@ -76,7 +76,7 @@ class GitHubAppsClientTest(TestCase):
             assert gh_rate_limit.used == 7
             assert gh_rate_limit.next_window() == "17:39:49"
 
-    def test_get_rate_limit_specific_resouces(self):
+    def test_get_rate_limit_non_existent_resource(self):
         with pytest.raises(AssertionError):
             self.client.get_rate_limit("foo")
 

+ 52 - 33
tests/sentry/integrations/github/test_integration.py

@@ -6,7 +6,7 @@ from django.urls import reverse
 
 import sentry
 from sentry.constants import ObjectStatus
-from sentry.integrations.github import API_ERRORS, GitHubIntegrationProvider
+from sentry.integrations.github import API_ERRORS, MINIMUM_REQUESTS, GitHubIntegrationProvider
 from sentry.integrations.utils.code_mapping import Repo, RepoTree
 from sentry.models import Integration, OrganizationIntegration, Project, Repository
 from sentry.plugins.base import plugins
@@ -585,6 +585,18 @@ class GitHubIntegrationTest(IntegrationTestCase):
             organization=self.organization, integration=integration
         ).exists()
 
+    def set_rate_limit(self, remaining, limit=5000):
+        """Helper class to set the rate limit"""
+        responses.add(
+            method=responses.GET,
+            url="https://api.github.com/rate_limit",
+            json={
+                "resources": {
+                    "core": {"limit": limit, "remaining": remaining, "used": "foo", "reset": 123}
+                }
+            },
+        )
+
     def get_installation_helper(self):
         with self.tasks():
             self.assert_setup_flow()  # This somehow creates the integration
@@ -593,36 +605,11 @@ class GitHubIntegrationTest(IntegrationTestCase):
         installation = integration.get_installation(self.organization.id)
         return installation
 
-    @responses.activate
-    def test_get_trees_for_org_handles_rate_limit_reached(self):
-        """Test that we will not hit Github's API more than once when we reach the API rate limit"""
-        responses.replace(
-            responses.GET,
-            f"{self.base_url}/repos/Test-Organization/foo/git/trees/master?recursive=1",
-            json={"message": "API rate limit exceeded for installation ID 123456."},
-            status=403,
-        )
-
-        installation = self.get_installation_helper()
-        trees = installation.get_trees_for_org()
-        key_prefix = "github:repo:Test-Organization"
-        # We have access to the files because the rate limit has not been hit
-        assert cache.get(f"{key_prefix}/xyz:source-code") == ["src/foo.py"]
-        # These repos are None because foo hit the rate limit
-        for repo in ("foo", "bar", "baz"):
-            assert cache.get(f"{key_prefix}/{repo}:source-code") is None
-        assert len(trees.keys()) == 1
-
-        assert trees == {
-            "Test-Organization/xyz": RepoTree(
-                Repo("Test-Organization/xyz", "master"), ["src/foo.py"]
-            )
-        }
-
     @responses.activate
     def test_get_trees_for_org_works(self):
         """Fetch the tree representation of a repo"""
         installation = self.get_installation_helper()
+        self.set_rate_limit(MINIMUM_REQUESTS + 50)
         expected_trees = {
             "Test-Organization/bar": RepoTree(Repo("Test-Organization/bar", "main"), []),
             "Test-Organization/baz": RepoTree(Repo("Test-Organization/baz", "master"), []),
@@ -635,22 +622,54 @@ class GitHubIntegrationTest(IntegrationTestCase):
             ),
         }
 
-        assert not cache.get("githubtrees:repositories:Test-Organization")
+        repos_key = "githubtrees:repositories:Test-Organization"
+        repo_key = lambda x: f"github:repo:Test-Organization/{x}:source-code"
         # Check that the cache is clear
-        repo_key = "github:repo:Test-Organization/foo:source-code"
-        assert cache.get("githubtrees:repositories:Test-Organization") is None
-        assert cache.get(repo_key) is None
+        assert cache.get(repos_key) is None
+        assert cache.get(repo_key("foo")) is None
         trees = installation.get_trees_for_org()
 
-        assert cache.get("githubtrees:repositories:Test-Organization") == [
+        assert cache.get(repos_key) == [
             {"full_name": "Test-Organization/xyz", "default_branch": "master"},
             {"full_name": "Test-Organization/foo", "default_branch": "master"},
             {"full_name": "Test-Organization/bar", "default_branch": "main"},
             {"full_name": "Test-Organization/baz", "default_branch": "master"},
         ]
-        assert cache.get(repo_key) == ["src/sentry/api/endpoints/auth_login.py"]
+        assert cache.get(repo_key("foo")) == ["src/sentry/api/endpoints/auth_login.py"]
         assert trees == expected_trees
 
         # Calling a second time should produce the same results
         trees = installation.get_trees_for_org()
         assert trees == expected_trees
+
+    @responses.activate
+    def test_get_trees_for_org_prevent_exhaustion_some_repos(self):
+        """Some repos will hit the network but the rest will grab from the cache."""
+        gh_org = "Test-Organization"
+        installation = self.get_installation_helper()
+        expected_trees = {
+            f"{gh_org}/xyz": RepoTree(Repo(f"{gh_org}/xyz", "master"), ["src/foo.py"]),
+            # This will have no files because we will hit the minimum remaining requests floor
+            f"{gh_org}/foo": RepoTree(Repo(f"{gh_org}/foo", "master"), []),
+            f"{gh_org}/bar": RepoTree(Repo(f"{gh_org}/bar", "main"), []),
+            f"{gh_org}/baz": RepoTree(Repo(f"{gh_org}/baz", "master"), []),
+        }
+        with patch("sentry.integrations.github.client.MINIMUM_REQUESTS", new=5, autospec=False):
+            # We start with one request left before reaching the minimum remaining requests floor
+            self.set_rate_limit(remaining=6)
+            trees = installation.get_trees_for_org()
+            assert trees == expected_trees  # xyz will have files but not foo
+
+            # Another call should not make us lose the files for xyz
+            self.set_rate_limit(remaining=5)
+            trees = installation.get_trees_for_org()
+            assert trees == expected_trees  # xyz will have files but not foo
+
+            # We reset the remaining values
+            self.set_rate_limit(remaining=20)
+            trees = installation.get_trees_for_org()
+            # Now that the rate limit is reset we should get files for foo
+            expected_trees[f"{gh_org}/foo"] = RepoTree(
+                Repo(f"{gh_org}/foo", "master"), ["src/sentry/api/endpoints/auth_login.py"]
+            )
+            assert trees == expected_trees