  1. """ robotparser.py
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
  6. The robots.txt Exclusion Protocol is implemented as specified in
  7. http://www.robotstxt.org/norobots-rfc.txt
  8. """
  9. import collections
  10. import urllib.error
  11. import urllib.parse
  12. import urllib.request
  13. __all__ = ["RobotFileParser"]
  14. RequestRate = collections.namedtuple("RequestRate", "requests seconds")
  15. class RobotFileParser:
  16. """ This class provides a set of methods to read, parse and answer
  17. questions about a single robots.txt file.
  18. """
  19. def __init__(self, url=''):
  20. self.entries = []
  21. self.sitemaps = []
  22. self.default_entry = None
  23. self.disallow_all = False
  24. self.allow_all = False
  25. self.set_url(url)
  26. self.last_checked = 0
  27. def mtime(self):
  28. """Returns the time the robots.txt file was last fetched.
  29. This is useful for long-running web spiders that need to
  30. check for new robots.txt files periodically.
  31. """
  32. return self.last_checked
  33. def modified(self):
  34. """Sets the time the robots.txt file was last fetched to the
  35. current time.
  36. """
  37. import time
  38. self.last_checked = time.time()
  39. def set_url(self, url):
  40. """Sets the URL referring to a robots.txt file."""
  41. self.url = url
  42. self.host, self.path = urllib.parse.urlparse(url)[1:3]
  43. def read(self):
  44. """Reads the robots.txt URL and feeds it to the parser."""
  45. try:
  46. f = urllib.request.urlopen(self.url)
  47. except urllib.error.HTTPError as err:
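            # 401/403 mean access to robots.txt is explicitly refused,
            # so treat every path as disallowed; any other 4xx (e.g. 404,
            # no robots.txt at all) means there are no restrictions and
            # everything is allowed.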
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
            err.close()
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    # so it doesn't matter where you place it in your file."
                    # Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
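        # Strip the scheme and host, re-quote what remains, and fall back
        # to "/" so the URL can be compared against the quoted rule paths.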
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
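        # Normalize the path and percent-encode it, matching the quoting
        # that can_fetch() applies to incoming URLs before the prefix
        # comparison in applies_to().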
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
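

# ----------------------------------------------------------------------
# Minimal usage sketch: feed robots.txt lines to parse() directly so no
# network access is needed.  The rules and the "ExampleBot" agent name
# below are illustrative assumptions, not real data.  Note that rules are
# matched in file order, so the more specific Allow line is listed before
# the broader Disallow line.
if __name__ == "__main__":
    rp = RobotFileParser()
    rp.parse([
        "User-agent: *",
        "Crawl-delay: 2",
        "Allow: /private/public-note.html",
        "Disallow: /private/",
        "Sitemap: https://example.com/sitemap.xml",
    ])
    print(rp.can_fetch("ExampleBot", "https://example.com/private/secret.html"))       # False
    print(rp.can_fetch("ExampleBot", "https://example.com/private/public-note.html"))  # True
    print(rp.crawl_delay("ExampleBot"))  # 2
    print(rp.site_maps())                # ['https://example.com/sitemap.xml']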