9.2 KB

  1. """
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/norobots-rfc.txt
  8. """
  9. import collections
  10. import urllib.error
  11. import urllib.parse
  12. import urllib.request
# Names exported by a star-import of this module.
__all__ = ["RobotFileParser"]

# Value stored on an entry for a "Request-rate: <requests>/<seconds>" line.
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
  15. class RobotFileParser:
  16. """ This class provides a set of methods to read, parse and answer
  17. questions about a single robots.txt file.
  18. """
  19. def __init__(self, url=''):
  20. self.entries = []
  21. self.sitemaps = []
  22. self.default_entry = None
  23. self.disallow_all = False
  24. self.allow_all = False
  25. self.set_url(url)
  26. self.last_checked = 0
  27. def mtime(self):
  28. """Returns the time the robots.txt file was last fetched.
  29. This is useful for long-running web spiders that need to
  30. check for new robots.txt files periodically.
  31. """
  32. return self.last_checked
  33. def modified(self):
  34. """Sets the time the robots.txt file was last fetched to the
  35. current time.
  36. """
  37. import time
  38. self.last_checked = time.time()
  39. def set_url(self, url):
  40. """Sets the URL referring to a robots.txt file."""
  41. self.url = url
  42., self.path = urllib.parse.urlparse(url)[1:3]
  43. def read(self):
  44. """Reads the robots.txt URL and feeds it to the parser."""
  45. try:
  46. f = urllib.request.urlopen(self.url)
  47. except urllib.error.HTTPError as err:
  48. if err.code in (401, 403):
  49. self.disallow_all = True
  50. elif err.code >= 400 and err.code < 500:
  51. self.allow_all = True
  52. err.close()
  53. else:
  54. raw =
  55. self.parse(raw.decode("utf-8").splitlines())
  56. def _add_entry(self, entry):
  57. if "*" in entry.useragents:
  58. # the default entry is considered last
  59. if self.default_entry is None:
  60. # the first default entry wins
  61. self.default_entry = entry
  62. else:
  63. self.entries.append(entry)
  64. def parse(self, lines):
  65. """Parse the input lines from a robots.txt file.
  66. We allow that a user-agent: line is not preceded by
  67. one or more blank lines.
  68. """
  69. # states:
  70. # 0: start state
  71. # 1: saw user-agent line
  72. # 2: saw an allow or disallow line
  73. state = 0
  74. entry = Entry()
  75. self.modified()
  76. for line in lines:
  77. if not line:
  78. if state == 1:
  79. entry = Entry()
  80. state = 0
  81. elif state == 2:
  82. self._add_entry(entry)
  83. entry = Entry()
  84. state = 0
  85. # remove optional comment and strip line
  86. i = line.find('#')
  87. if i >= 0:
  88. line = line[:i]
  89. line = line.strip()
  90. if not line:
  91. continue
  92. line = line.split(':', 1)
  93. if len(line) == 2:
  94. line[0] = line[0].strip().lower()
  95. line[1] = urllib.parse.unquote(line[1].strip())
  96. if line[0] == "user-agent":
  97. if state == 2:
  98. self._add_entry(entry)
  99. entry = Entry()
  100. entry.useragents.append(line[1])
  101. state = 1
  102. elif line[0] == "disallow":
  103. if state != 0:
  104. entry.rulelines.append(RuleLine(line[1], False))
  105. state = 2
  106. elif line[0] == "allow":
  107. if state != 0:
  108. entry.rulelines.append(RuleLine(line[1], True))
  109. state = 2
  110. elif line[0] == "crawl-delay":
  111. if state != 0:
  112. # before trying to convert to int we need to make
  113. # sure that robots.txt has valid syntax otherwise
  114. # it will crash
  115. if line[1].strip().isdigit():
  116. entry.delay = int(line[1])
  117. state = 2
  118. elif line[0] == "request-rate":
  119. if state != 0:
  120. numbers = line[1].split('/')
  121. # check if all values are sane
  122. if (len(numbers) == 2 and numbers[0].strip().isdigit()
  123. and numbers[1].strip().isdigit()):
  124. entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
  125. state = 2
  126. elif line[0] == "sitemap":
  127. # According to
  128. # "This directive is independent of the user-agent line,
  129. # so it doesn't matter where you place it in your file."
  130. # Therefore we do not change the state of the parser.
  131. self.sitemaps.append(line[1])
  132. if state == 2:
  133. self._add_entry(entry)
  134. def can_fetch(self, useragent, url):
  135. """using the parsed robots.txt decide if useragent can fetch url"""
  136. if self.disallow_all:
  137. return False
  138. if self.allow_all:
  139. return True
  140. # Until the robots.txt file has been read or found not
  141. # to exist, we must assume that no url is allowable.
  142. # This prevents false positives when a user erroneously
  143. # calls can_fetch() before calling read().
  144. if not self.last_checked:
  145. return False
  146. # search for given user agent matches
  147. # the first match counts
  148. parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
  149. url = urllib.parse.urlunparse(('','',parsed_url.path,
  150. parsed_url.params,parsed_url.query, parsed_url.fragment))
  151. url = urllib.parse.quote(url)
  152. if not url:
  153. url = "/"
  154. for entry in self.entries:
  155. if entry.applies_to(useragent):
  156. return entry.allowance(url)
  157. # try the default entry last
  158. if self.default_entry:
  159. return self.default_entry.allowance(url)
  160. # agent not found ==> access granted
  161. return True
  162. def crawl_delay(self, useragent):
  163. if not self.mtime():
  164. return None
  165. for entry in self.entries:
  166. if entry.applies_to(useragent):
  167. return entry.delay
  168. if self.default_entry:
  169. return self.default_entry.delay
  170. return None
  171. def request_rate(self, useragent):
  172. if not self.mtime():
  173. return None
  174. for entry in self.entries:
  175. if entry.applies_to(useragent):
  176. return entry.req_rate
  177. if self.default_entry:
  178. return self.default_entry.req_rate
  179. return None
  180. def site_maps(self):
  181. if not self.sitemaps:
  182. return None
  183. return self.sitemaps
  184. def __str__(self):
  185. entries = self.entries
  186. if self.default_entry is not None:
  187. entries = entries + [self.default_entry]
  188. return '\n\n'.join(map(str, entries))
  189. class RuleLine:
  190. """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
  191. (allowance==False) followed by a path."""
  192. def __init__(self, path, allowance):
  193. if path == '' and not allowance:
  194. # an empty value means allow all
  195. allowance = True
  196. path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
  197. self.path = urllib.parse.quote(path)
  198. self.allowance = allowance
  199. def applies_to(self, filename):
  200. return self.path == "*" or filename.startswith(self.path)
  201. def __str__(self):
  202. return ("Allow" if self.allowance else "Disallow") + ": " + self.path
  203. class Entry:
  204. """An entry has one or more user-agents and zero or more rulelines"""
  205. def __init__(self):
  206. self.useragents = []
  207. self.rulelines = []
  208. self.delay = None
  209. self.req_rate = None
  210. def __str__(self):
  211. ret = []
  212. for agent in self.useragents:
  213. ret.append(f"User-agent: {agent}")
  214. if self.delay is not None:
  215. ret.append(f"Crawl-delay: {self.delay}")
  216. if self.req_rate is not None:
  217. rate = self.req_rate
  218. ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
  219. ret.extend(map(str, self.rulelines))
  220. return '\n'.join(ret)
  221. def applies_to(self, useragent):
  222. """check if this entry applies to the specified agent"""
  223. # split the name token and make it lower case
  224. useragent = useragent.split("/")[0].lower()
  225. for agent in self.useragents:
  226. if agent == '*':
  227. # we have the catch-all agent
  228. return True
  229. agent = agent.lower()
  230. if agent in useragent:
  231. return True
  232. return False
  233. def allowance(self, filename):
  234. """Preconditions:
  235. - our agent applies to this entry
  236. - filename is URL decoded"""
  237. for line in self.rulelines:
  238. if line.applies_to(filename):
  239. return line.allowance
  240. return True