Escribí un rastreador en python usando la herramienta scrapy de python. El siguiente es el código Python:Scrapy Crawler en python no puede seguir los enlaces?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
#from scrapy.item import Item
from a11ypi.items import AYpiItem
class AYpiSpider(CrawlSpider):
name = "AYpi"
allowed_domains = ["a11y.in"]
start_urls = ["http://a11y.in/a11ypi/idea/firesafety.html"]
rules =(
Rule(SgmlLinkExtractor(allow =()) ,callback = 'parse_item')
)
def parse_item(self,response):
#filename = response.url.split("/")[-1]
#open(filename,'wb').write(response.body)
#testing codes^(the above)
hxs = HtmlXPathSelector(response)
item = AYpiItem()
item["foruri"] = hxs.select("//@foruri").extract()
item["thisurl"] = response.url
item["thisid"] = hxs.select("//@foruri/../@id").extract()
item["rec"] = hxs.select("//@foruri/../@rec").extract()
return item
Pero, en lugar de seguir los vínculos que el error que se produce es:
Traceback (most recent call last):
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/cmdline.py", line 131, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/cmdline.py", line 97, in _run_print_help
func(*a, **kw)
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/cmdline.py", line 138, in _run_command
cmd.run(args, opts)
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/commands/crawl.py", line 45, in run
q.append_spider_name(name, **opts.spargs)
--- <exception caught here> ---
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/queue.py", line 89, in append_spider_name
spider = self._spiders.create(name, **spider_kwargs)
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/spidermanager.py", line 36, in create
return self._spiders[spider_name](**spider_kwargs)
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/contrib/spiders/crawl.py", line 38, in __init__
self._compile_rules()
File "/usr/lib/python2.6/site-packages/Scrapy-0.12.0.2538-py2.6.egg/scrapy/contrib/spiders/crawl.py", line 82, in _compile_rules
self._rules = [copy.copy(r) for r in self.rules]
exceptions.TypeError: 'Rule' object is not iterable
Puede alguien por favor me explique lo que está pasando? Dado que esto es lo mencionado en la documentación y dejo el campo de permiso en blanco, eso mismo debería hacer que siga True por defecto. Entonces, ¿por qué el error? ¿Qué tipo de optimizaciones puedo hacer con mi rastreador para que sea rápido?
corrigió mi problema, gracias. – alex