Scrapy: inserting data into the MySQL database
I am experincing error : spider has no object _getitem_
Here is my Pipeline code
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import sys
import MySQLdb
import MySQLdb.cursors
from scrapy import log
class OnthegoPipeline(object):
def __init__(self):
self.dbpool = adbapi.ConnectionPool('MySQLdb',
db='scrapy',
host='127.0.0.1',
user='root',
passwd='',
cursorclass=MySQLdb.cursors.DictCursor,
charset='utf8',
use_unicode=True,
)
def process_item(self, spider, item):
query = self.dbpool.runInteraction(self._conditional_insert, item)
query.addErrback(self.handle_error)
return item
def _conditional_insert(self, tx, item):
tx.execute(\
"insert into links (link) "
"values (%s)",
(item['enam'])
)
def handle_error(self, e):
log.err(e)
and this is my Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from onthego.items import OnthegoItem
import sys
import MySQLdb
import MySQLdb.cursors
from MySQLdb import escape_string
import urlparse
import _mysql
# defining function to get the full craped links
def complete_url(string):
"""Return complete url"""
return "http://www.timeoutdelhi.net" + string
#return url.encode('utf8')
#return urlparse.urljoin("http://www.timeoutdelhi.net", string)
class GoSpider(BaseSpider):
name = "title"
allowed_domains = ["timeoutdelhi.net"]
start_urls = [
"http://www.timeoutdelhi.net/"
]
#rules = [Rule(SgmlLinkExtractor(allow=['a\d+\.htm$']), 'parse_item'),
#Rule(SgmlLinkExtractor(restrict_xpaths='//td[@class="pgs"]'),)]
def parse(self, response):
hxs=HtmlXPathSelector(response)
sites=hxs.select('//ul[@class="menu"]/li[@class="last leaf events
events"]/a/@href')
sites1=hxs.select('//ul[@class="menu"]/li[@class="last leaf
sales-exhibitions sales-exhibitions"]/a/@href')
items=[]
for site in sites:
link = site.extract()
yield Request(complete_url(link), callback=self.parse_category)
for sit in sites1:
link1 = sit.extract()
yield Request(complete_url(link1), callback=self.parse_category)
def parse_category(self, response):
hxs = HtmlXPathSelector(response)
# HXS to Detail link inside td and a
sites =
hxs.select('//div[@class="left-column"]/div[@class="resultContainer1"]/span[@class="field-content"]/h2/a/@href')
sites2 =
hxs.select('//div[@class="left-column"]/div[@class="resultContainer1"]/h2/a/@href')
sites1 =
hxs.select('//div[@class="left-column"]/div[@class="resultContainer"]/span/h2/a/@href')
items=[]
for sit in sites2:
link=sit.extract()
yield Request(complete_url(link),
callback=self.parse_category_tilte)
for site in sites:
link1=site.extract()
yield Request(complete_url(link1),
callback=self.parse_category_tilte)
for site in sites1:
link1=site.extract()
yield Request(complete_url(link1),
callback=self.parse_category_tilte)
def parse_category_tilte(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[@class="box-header"]/h3/text()')
items=[]
for site in sites:
item=OnthegoItem()
item['ename']=site.extract()
items.append(item)
return items
I've tried to change the variable names and even the list. I think it is
unable to reach the item['ename'] in the pipeline . Please help.
No comments:
Post a Comment