# coding: utf-8
"""Example Grab spider: collect Habrahabr posts and one image per post.

The crawl is a chain of task handlers; a task named ``X`` is handled by
the method ``task_X`` of the spider:

1. ``task_initial``      - parse the home page, queue one task per post.
2. ``task_habrapost``    - read the post title, queue an image search.
3. ``task_image_search`` - read an image URL from the search results.
4. ``task_image``        - save the image, append a row to ``result.txt``.

The script targets Python 2 (``urllib.quote_plus``, byte-string CSV rows).
"""
import csv
import logging
import os
import urllib

from grab.spider import Spider, Task


class ExampleSpider(Spider):
    """Spider that walks Habrahabr posts and downloads an image for each."""

    # Pages the crawl starts from; they are handled by ``task_initial``.
    initial_urls = ['http://habrahabr.ru/']

    # File object behind ``self.result_file``; kept so it can be closed.
    _result_fh = None

    def prepare(self):
        """Set up output state.

        Hook expected to run once before crawling starts (see the Grab
        Spider documentation). Creates the ``images`` directory, opens
        ``result.txt`` for writing and resets the image counter.
        """
        # NOTE(review): it is not visible here whether response.save()
        # creates missing parent directories, so make sure it exists.
        if not os.path.isdir('images'):
            os.makedirs('images')

        # Keep a reference to the file object: csv.writer() does not expose
        # it, and without one the file can never be closed explicitly.
        self._result_fh = open('result.txt', 'w')
        self.result_file = csv.writer(self._result_fh)

        # Images are named by a running counter rather than by post title.
        self.result_counter = 0

    def close_result_file(self):
        """Close ``result.txt`` if it is open; safe to call repeatedly."""
        if self._result_fh is not None:
            self._result_fh.close()
            self._result_fh = None

    def task_initial(self, grab, task):
        """Handle a start page: yield one ``habrapost`` task per post link.

        Yielding a Task from a handler is the generator form of
        ``self.add_task(Task('habrapost', url=...))``.
        """
        print('Habrahabr home page')
        for elem in grab.xpath_list(
                '//h1[@class="title"]/a[@class="post_title"]'):
            yield Task('habrapost', url=elem.get('href'))

    def task_habrapost(self, grab, task):
        """Handle a post page: read its title and queue an image search.

        The collected ``post`` dict is attached to the next task as an
        extra keyword argument so later handlers can read ``task.post``.
        """
        print('Habrahabr topic: %s' % task.url)
        post = {
            'url': task.url,
            'title': grab.xpath_text('//h1/span[@class="post_title"]'),
        }
        # The title is encoded to UTF-8 bytes before URL-quoting.
        query = urllib.quote_plus(post['title'].encode('utf-8'))
        search_url = (
            'http://images.yandex.ru/yandsearch?text=%s&rpt=image' % query
        )
        yield Task('image_search', url=search_url, post=post)

    def task_image_search(self, grab, task):
        """Handle search results: queue a download of the found image.

        ``task.post`` is forwarded unchanged to the ``image`` task.
        """
        print('Images search result for %s' % task.post['title'])
        image_url = grab.xpath_text('//div[@class="b-image"]/a/img/@src')
        yield Task('image', url=image_url, post=task.post)

    def task_image(self, grab, task):
        """Handle a downloaded image: save it and record a CSV row.

        The row holds the post URL, the post title and the saved file path.
        """
        print('Image downloaded for %s' % task.post['title'])
        path = 'images/%s.jpg' % self.result_counter
        grab.response.save(path)
        self.result_file.writerow([
            task.post['url'].encode('utf-8'),
            task.post['title'].encode('utf-8'),
            path,
        ])
        # Advance the counter so the next image gets a new file name.
        self.result_counter += 1


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    # NOTE(review): thread_number is passed straight to Spider; presumably
    # it sets the number of concurrent requests - confirm in Grab docs.
    bot = ExampleSpider(thread_number=2)
    try:
        bot.run()
    finally:
        # Flush and close result.txt even if the crawl is interrupted.
        bot.close_result_file()
# Source: https://habr.com/ru/post/142288/
# All Articles