# test_robotparser.py — tests for urllib.robotparser
  1. import io
  2. import os
  3. import threading
  4. import unittest
  5. import urllib.robotparser
  6. from test import support
  7. from test.support import socket_helper
  8. from test.support import threading_helper
  9. from http.server import BaseHTTPRequestHandler, HTTPServer
  10. class BaseRobotTest:
  11. robots_txt = ''
  12. agent = 'test_robotparser'
  13. good = []
  14. bad = []
  15. site_maps = None
  16. def setUp(self):
  17. lines = io.StringIO(self.robots_txt).readlines()
  18. self.parser = urllib.robotparser.RobotFileParser()
  19. self.parser.parse(lines)
  20. def get_agent_and_url(self, url):
  21. if isinstance(url, tuple):
  22. agent, url = url
  23. return agent, url
  24. return self.agent, url
  25. def test_good_urls(self):
  26. for url in self.good:
  27. agent, url = self.get_agent_and_url(url)
  28. with self.subTest(url=url, agent=agent):
  29. self.assertTrue(self.parser.can_fetch(agent, url))
  30. def test_bad_urls(self):
  31. for url in self.bad:
  32. agent, url = self.get_agent_and_url(url)
  33. with self.subTest(url=url, agent=agent):
  34. self.assertFalse(self.parser.can_fetch(agent, url))
  35. def test_site_maps(self):
  36. self.assertEqual(self.parser.site_maps(), self.site_maps)
  37. class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
  38. robots_txt = """\
  39. User-agent: *
  40. Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  41. Disallow: /tmp/ # these will soon disappear
  42. Disallow: /foo.html
  43. """
  44. good = ['/', '/test.html']
  45. bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
  46. class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
  47. robots_txt = """\
  48. # robots.txt for http://www.example.com/
  49. User-agent: *
  50. Crawl-delay: 1
  51. Request-rate: 3/15
  52. Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  53. # Cybermapper knows where to go.
  54. User-agent: cybermapper
  55. Disallow:
  56. """
  57. good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
  58. bad = ['/cyberworld/map/index.html']
  59. class SitemapTest(BaseRobotTest, unittest.TestCase):
  60. robots_txt = """\
  61. # robots.txt for http://www.example.com/
  62. User-agent: *
  63. Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
  64. Sitemap: http://www.google.com/hostednews/sitemap_index.xml
  65. Request-rate: 3/15
  66. Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  67. """
  68. good = ['/', '/test.html']
  69. bad = ['/cyberworld/map/index.html']
  70. site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
  71. 'http://www.google.com/hostednews/sitemap_index.xml']
  72. class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
  73. robots_txt = """\
  74. # go away
  75. User-agent: *
  76. Disallow: /
  77. """
  78. good = []
  79. bad = ['/cyberworld/map/index.html', '/', '/tmp/']
  80. class BaseRequestRateTest(BaseRobotTest):
  81. request_rate = None
  82. crawl_delay = None
  83. def test_request_rate(self):
  84. parser = self.parser
  85. for url in self.good + self.bad:
  86. agent, url = self.get_agent_and_url(url)
  87. with self.subTest(url=url, agent=agent):
  88. self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
  89. parsed_request_rate = parser.request_rate(agent)
  90. self.assertEqual(parsed_request_rate, self.request_rate)
  91. if self.request_rate is not None:
  92. self.assertIsInstance(
  93. parsed_request_rate,
  94. urllib.robotparser.RequestRate
  95. )
  96. self.assertEqual(
  97. parsed_request_rate.requests,
  98. self.request_rate.requests
  99. )
  100. self.assertEqual(
  101. parsed_request_rate.seconds,
  102. self.request_rate.seconds
  103. )
  104. class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
  105. robots_txt = ''
  106. good = ['/foo']
  107. class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
  108. robots_txt = """\
  109. User-agent: figtree
  110. Crawl-delay: 3
  111. Request-rate: 9/30
  112. Disallow: /tmp
  113. Disallow: /a%3cd.html
  114. Disallow: /a%2fb.html
  115. Disallow: /%7ejoe/index.html
  116. """
  117. agent = 'figtree'
  118. request_rate = urllib.robotparser.RequestRate(9, 30)
  119. crawl_delay = 3
  120. good = [('figtree', '/foo.html')]
  121. bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
  122. '/a%2fb.html', '/~joe/index.html']
  123. class DifferentAgentTest(CrawlDelayAndRequestRateTest):
  124. agent = 'FigTree Robot libwww-perl/5.04'
  125. class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
  126. robots_txt = """\
  127. User-agent: *
  128. Disallow: /tmp/
  129. Disallow: /a%3Cd.html
  130. Disallow: /a/b.html
  131. Disallow: /%7ejoe/index.html
  132. Crawl-delay: 3
  133. Request-rate: 9/banana
  134. """
  135. good = ['/tmp']
  136. bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
  137. '/%7Ejoe/index.html']
  138. crawl_delay = 3
  139. class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
  140. # From bug report #523041
  141. robots_txt = """\
  142. User-Agent: *
  143. Disallow: /.
  144. Crawl-delay: pears
  145. """
  146. good = ['/foo.html']
  147. # bug report says "/" should be denied, but that is not in the RFC
  148. bad = []
  149. class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
  150. # also test that Allow and Diasallow works well with each other
  151. robots_txt = """\
  152. User-agent: Googlebot
  153. Allow: /folder1/myfile.html
  154. Disallow: /folder1/
  155. Request-rate: whale/banana
  156. """
  157. agent = 'Googlebot'
  158. good = ['/folder1/myfile.html']
  159. bad = ['/folder1/anotherfile.html']
  160. class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
  161. # the order of User-agent should be correct. note
  162. # that this file is incorrect because "Googlebot" is a
  163. # substring of "Googlebot-Mobile"
  164. robots_txt = """\
  165. User-agent: Googlebot
  166. Disallow: /
  167. User-agent: Googlebot-Mobile
  168. Allow: /
  169. """
  170. agent = 'Googlebot'
  171. bad = ['/something.jpg']
  172. class UserAgentGoogleMobileTest(UserAgentOrderingTest):
  173. agent = 'Googlebot-Mobile'
  174. class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
  175. # Google also got the order wrong. You need
  176. # to specify the URLs from more specific to more general
  177. robots_txt = """\
  178. User-agent: Googlebot
  179. Allow: /folder1/myfile.html
  180. Disallow: /folder1/
  181. """
  182. agent = 'googlebot'
  183. good = ['/folder1/myfile.html']
  184. bad = ['/folder1/anotherfile.html']
  185. class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
  186. # see issue #6325 for details
  187. robots_txt = """\
  188. User-agent: *
  189. Disallow: /some/path?name=value
  190. """
  191. good = ['/some/path']
  192. bad = ['/some/path?name=value']
  193. class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
  194. # obey first * entry (#4108)
  195. robots_txt = """\
  196. User-agent: *
  197. Disallow: /some/path
  198. User-agent: *
  199. Disallow: /another/path
  200. """
  201. good = ['/another/path']
  202. bad = ['/some/path']
  203. class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
  204. # normalize the URL first (#17403)
  205. robots_txt = """\
  206. User-agent: *
  207. Allow: /some/path?
  208. Disallow: /another/path?
  209. """
  210. good = ['/some/path?']
  211. bad = ['/another/path?']
  212. class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
  213. robots_txt = """\
  214. User-agent: *
  215. Crawl-delay: 1
  216. Request-rate: 3/15
  217. Disallow: /cyberworld/map/
  218. """
  219. request_rate = urllib.robotparser.RequestRate(3, 15)
  220. crawl_delay = 1
  221. good = ['/', '/test.html']
  222. bad = ['/cyberworld/map/index.html']
  223. class StringFormattingTest(BaseRobotTest, unittest.TestCase):
  224. robots_txt = """\
  225. User-agent: *
  226. Crawl-delay: 1
  227. Request-rate: 3/15
  228. Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  229. # Cybermapper knows where to go.
  230. User-agent: cybermapper
  231. Disallow: /some/path
  232. """
  233. expected_output = """\
  234. User-agent: cybermapper
  235. Disallow: /some/path
  236. User-agent: *
  237. Crawl-delay: 1
  238. Request-rate: 3/15
  239. Disallow: /cyberworld/map/\
  240. """
  241. def test_string_formatting(self):
  242. self.assertEqual(str(self.parser), self.expected_output)
  243. class RobotHandler(BaseHTTPRequestHandler):
  244. def do_GET(self):
  245. self.send_error(403, "Forbidden access")
  246. def log_message(self, format, *args):
  247. pass
  248. @unittest.skipUnless(
  249. support.has_socket_support,
  250. "Socket server requires working socket."
  251. )
  252. class PasswordProtectedSiteTestCase(unittest.TestCase):
  253. def setUp(self):
  254. # clear _opener global variable
  255. self.addCleanup(urllib.request.urlcleanup)
  256. self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
  257. self.t = threading.Thread(
  258. name='HTTPServer serving',
  259. target=self.server.serve_forever,
  260. # Short poll interval to make the test finish quickly.
  261. # Time between requests is short enough that we won't wake
  262. # up spuriously too many times.
  263. kwargs={'poll_interval':0.01})
  264. self.t.daemon = True # In case this function raises.
  265. self.t.start()
  266. def tearDown(self):
  267. self.server.shutdown()
  268. self.t.join()
  269. self.server.server_close()
  270. @threading_helper.reap_threads
  271. def testPasswordProtectedSite(self):
  272. addr = self.server.server_address
  273. url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
  274. robots_url = url + "/robots.txt"
  275. parser = urllib.robotparser.RobotFileParser()
  276. parser.set_url(url)
  277. parser.read()
  278. self.assertFalse(parser.can_fetch("*", robots_url))
  279. @support.requires_working_socket()
  280. class NetworkTestCase(unittest.TestCase):
  281. base_url = 'http://www.pythontest.net/'
  282. robots_txt = '{}elsewhere/robots.txt'.format(base_url)
  283. @classmethod
  284. def setUpClass(cls):
  285. support.requires('network')
  286. with socket_helper.transient_internet(cls.base_url):
  287. cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
  288. cls.parser.read()
  289. def url(self, path):
  290. return '{}{}{}'.format(
  291. self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
  292. )
  293. def test_basic(self):
  294. self.assertFalse(self.parser.disallow_all)
  295. self.assertFalse(self.parser.allow_all)
  296. self.assertGreater(self.parser.mtime(), 0)
  297. self.assertFalse(self.parser.crawl_delay('*'))
  298. self.assertFalse(self.parser.request_rate('*'))
  299. def test_can_fetch(self):
  300. self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
  301. self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
  302. self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
  303. self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
  304. self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
  305. self.assertTrue(self.parser.can_fetch('*', self.base_url))
  306. def test_read_404(self):
  307. parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
  308. parser.read()
  309. self.assertTrue(parser.allow_all)
  310. self.assertFalse(parser.disallow_all)
  311. self.assertEqual(parser.mtime(), 0)
  312. self.assertIsNone(parser.crawl_delay('*'))
  313. self.assertIsNone(parser.request_rate('*'))
  314. if __name__=='__main__':
  315. unittest.main()