spiders\
    __init__.py
    abiturlist.py
    SpecSpider.py
__init__.py
items.py
pipelines.py
settings.py
# items.py
from scrapy.item import Item, Field


class SpecItem(Item):
    spec = Field()
    SpecName = Field()


class GtudataItem(Item):
    family = Field()
    name = Field()
    surname = Field()
    spec = Field()
    ball = Field()
    url = Field()
    pagespec = Field()
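Scrapy items behave like dictionaries restricted to the declared fields, which is why every attribute used later in the spiders and in the pipeline has to be listed here. A quick illustration (the values are hypothetical):

item = GtudataItem()
item['family'] = 'Ivanov'   # declared field, accepted
item['ball'] = '271'
# item['age'] = 17          # not declared above, raises KeyError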
# spiders/abiturlist.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import TakeFirst, Identity
from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from gtudata.items import GtudataItem


class AbiturLoader(ItemLoader):
    default_output_processor = Identity()


class AbiturlistSpider(CrawlSpider):
    name = "abiturlist"
    allowed_domains = ["oreluniver.ru"]
    start_urls = ["http://oreluniver.ru/abits?src=all_postupil"]

    rules = (
        Rule(LinkExtractor(allow=('spec_id=04.03.01',)), callback='parse_item'),
    )

    def parse_item(self, response):
        hxs = Selector(response)
        # skip the header row of the applicants table
        rows = hxs.xpath("//tr[position()>1]")
        # speciality title shown in the page header
        pg_spec = hxs.xpath("//div[@class='page-content']/b/div/text()").extract()[0].strip()
        for fld in rows:
            item = GtudataItem()
            # second column holds family name, given name and patronymic in one string
            fio = fld.xpath("./td[2]/p/text()").extract()[0].split()
            item['family'] = fio[0]
            item['name'] = fio[1]
            item['surname'] = fio[2]
            item['spec'] = fld.xpath("./td[last()]/p/text()").extract()[0]
            item['ball'] = fld.xpath("string(./td[3]/p)").extract()[0]
            item['url'] = response.url
            item['pagespec'] = pg_spec
            yield item
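AbiturLoader is declared here but parse_item fills the item by hand. As a side note, the same fields could go through the loader; a minimal sketch, assuming a TakeFirst() output processor so load_item() returns single strings rather than the lists Identity() would produce (ScalarAbiturLoader and load_row are illustrative names, not part of the project):

class ScalarAbiturLoader(AbiturLoader):
    # assumption: keep only the first extracted value instead of a list
    default_output_processor = TakeFirst()


def load_row(fld, response, pg_spec):
    # fld is one <tr> selector taken from parse_item
    loader = ScalarAbiturLoader(item=GtudataItem(), selector=fld)
    loader.add_xpath('spec', './td[last()]/p/text()')
    loader.add_xpath('ball', 'string(./td[3]/p)')
    loader.add_value('url', response.url)
    loader.add_value('pagespec', pg_spec)
    fio = fld.xpath('./td[2]/p/text()').extract()[0].split()
    loader.add_value('family', fio[0])
    loader.add_value('name', fio[1])
    loader.add_value('surname', fio[2])
    return loader.load_item()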
# spiders/SpecSpider.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from gtudata.items import SpecItem


class SpecSpider(CrawlSpider):
    name = "speclist"
    allowed_domains = ["oreluniver.ru"]
    start_urls = ["http://oreluniver.ru/abits?src=all_postupil"]

    rules = (
        Rule(LinkExtractor(allow=('src=all_postupil',)), callback='parse_item'),
    )

    def parse_item(self, response):
        hxs = Selector(response)
        # every speciality link text looks like "04.03.01 Title of the speciality"
        links = hxs.xpath('//a[contains(@href, "spec_id")]/text()').extract()
        for fld in links:
            txt = fld.strip()
            item = SpecItem()
            item['SpecName'] = txt[9:]   # title after the 8-character code and a space
            item['spec'] = txt[:8]       # speciality code, e.g. 04.03.01
            yield item
id = Column(Integer, primary_key=True)
spec = Column(String)
Base.metadata.create_all(self.engine)
dt = DataTable(item['family'], item['name'], item['surname'],
               item['spec'], item['ball'], item['url'], item['pagespec'])  # for a GtudataItem
dt = SpecTable(item['spec'], item['SpecName'])  # for a SpecItem
fio = item['family'] + item['name'] + item['surname']
if fio not in self.fio:
    dt = DataTable(item['family'], item['name'], item['surname'],
                   item['spec'], item['ball'], item['url'], item['pagespec'])
    self.fio.add(fio)
    self.session.add(dt)
self.session.add(dt)
def open_spider(self, spider):
    self.session = Session(bind=self.engine)
def close_spider(self, spider):
    self.session.commit()
    self.session.close()
# pipelines.py
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import Session
import os

from gtudata.items import SpecItem, GtudataItem

Base = declarative_base()


class SpecTable(Base):
    __tablename__ = 'specdata'

    id = Column(Integer, primary_key=True)
    spec = Column(String)
    spectitle = Column(String)

    def __init__(self, spec, spectitle):
        self.spec = spec
        self.spectitle = spectitle

    def __repr__(self):
        return "<Data %s, %s>" % (self.spec, self.spectitle)


class DataTable(Base):
    __tablename__ = 'gtudata'

    id = Column(Integer, primary_key=True)
    family = Column(String)
    name = Column(String)
    surname = Column(String)
    spec = Column(String)
    ball = Column(Integer)
    url = Column(String)
    pagespec = Column(String)

    def __init__(self, family, name, surname, spec, ball, url, pagespec):
        self.family = family
        self.name = name
        self.surname = surname
        self.spec = spec
        self.ball = ball
        self.url = url
        self.pagespec = pagespec

    def __repr__(self):
        return "<Data %s, %s, %s, %s, %s, %s, %s>" % \
            (self.family, self.name, self.surname, self.spec,
             self.ball, self.url, self.pagespec)


class GtudataPipeline(object):
    def __init__(self):
        basename = 'data_scraped'
        self.engine = create_engine("sqlite:///%s" % basename, echo=False)
        # create the tables only on the first run, when the database file does not exist yet
        if not os.path.exists(basename):
            Base.metadata.create_all(self.engine)
        # applicants already seen in this run, used to drop duplicate rows
        self.fio = set()

    def process_item(self, item, spider):
        if isinstance(item, GtudataItem):
            fio = item['family'] + item['name'] + item['surname']
            if fio not in self.fio:
                dt = DataTable(item['family'], item['name'], item['surname'],
                               item['spec'], item['ball'], item['url'], item['pagespec'])
                self.fio.add(fio)
                self.session.add(dt)
        elif isinstance(item, SpecItem):
            dt = SpecTable(item['spec'], item['SpecName'])
            self.session.add(dt)
        return item

    def open_spider(self, spider):
        self.session = Session(bind=self.engine)

    def close_spider(self, spider):
        self.session.commit()
        self.session.close()
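For items to reach this pipeline it has to be enabled in settings.py; that file's contents are not shown here, but the relevant entry would look roughly like this (assuming the project package is named gtudata, as in the imports above):

# settings.py (sketch)
ITEM_PIPELINES = {
    'gtudata.pipelines.GtudataPipeline': 300,
}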
scrapy crawl speclist
scrapy crawl abiturlist
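Once both crawls have finished, the data_scraped SQLite file can be checked from a short SQLAlchemy session; a minimal sketch reusing the models from pipelines.py (the printed counts are whatever the crawl happened to collect):

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from gtudata.pipelines import DataTable, SpecTable

engine = create_engine("sqlite:///data_scraped", echo=False)
session = Session(bind=engine)

print(session.query(SpecTable).count())   # specialities collected by speclist
print(session.query(DataTable).count())   # applicant rows collected by abiturlist

for spec in session.query(SpecTable).all():
    print("%s %s" % (spec.spec, spec.spectitle))

session.close()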
Source: https://habr.com/ru/post/308660/