[3.7] bpo-32861: urllib.robotparser fix incomplete __str__ methods. (… · python/cpython@c3fa1f2

3 files changed

lines changed

Original file line number | Diff line number | Diff line change

@@ -246,6 +246,33 @@ class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):

246246

bad = ['/cyberworld/map/index.html']

247247
248248
249+

class StringFormattingTest(BaseRobotTest, unittest.TestCase):

250+

robots_txt = """\

251+

User-agent: *

252+

Crawl-delay: 1

253+

Request-rate: 3/15

254+

Disallow: /cyberworld/map/ # This is an infinite virtual URL space

255+
256+

# Cybermapper knows where to go.

257+

User-agent: cybermapper

258+

Disallow: /some/path

259+

"""

260+
261+

expected_output = """\

262+

User-agent: cybermapper

263+

Disallow: /some/path

264+
265+

User-agent: *

266+

Crawl-delay: 1

267+

Request-rate: 3/15

268+

Disallow: /cyberworld/map/

269+
270+

"""

271+
272+

def test_string_formatting(self):

273+

self.assertEqual(str(self.parser), self.expected_output)

274+
275+
249276

class RobotHandler(BaseHTTPRequestHandler):

250277
251278

def do_GET(self):

Original file line number | Diff line number | Diff line change

@@ -190,7 +190,10 @@ def request_rate(self, useragent):

190190

return self.default_entry.req_rate

191191
192192

def __str__(self):

193-

return ''.join([str(entry) + "\n" for entry in self.entries])

193+

entries = self.entries

194+

if self.default_entry is not None:

195+

entries = entries + [self.default_entry]

196+

return '\n'.join(map(str, entries)) + '\n'

194197
195198
196199

class RuleLine:

@@ -222,10 +225,15 @@ def __init__(self):

222225

def __str__(self):

223226

ret = []

224227

for agent in self.useragents:

225-

ret.extend(["User-agent: ", agent, "\n"])

226-

for line in self.rulelines:

227-

ret.extend([str(line), "\n"])

228-

return ''.join(ret)

228+

ret.append(f"User-agent: {agent}")

229+

if self.delay is not None:

230+

ret.append(f"Crawl-delay: {self.delay}")

231+

if self.req_rate is not None:

232+

rate = self.req_rate

233+

ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")

234+

ret.extend(map(str, self.rulelines))

235+

ret.append('') # for compatibility

236+

return '\n'.join(ret)

229237
230238

def applies_to(self, useragent):

231239

"""check if this entry applies to the specified agent"""

Original file line number | Diff line number | Diff line change

@@ -0,0 +1,3 @@

1+

The urllib.robotparser's ``__str__`` representation now includes wildcard

2+

entries and the "Crawl-delay" and "Request-rate" fields. Patch by

3+

Michael Lazar.