import sys
import random
import re
import os
from itertools import cycle, chain
from types import ListType, TupleType
from pprint import pprint
from cStringIO import StringIO
from hashlib import md5

from twisted.python.log import msg, err
from twisted.internet.defer import Deferred, DeferredQueue, inlineCallbacks, returnValue

from core.queues import httpQueue
from core.utils import sleep, get_best_resolution
from core.constants import DEBUG
from core.models import ImageModel, contextModel
from core.html import normalize, normalize_title

from base import sources, BaseIndexer


class CarwallsIndexer(BaseIndexer):
    """Indexer for the 'Carwalls.com' source (http://www.desktopmachine.com/).

    Walks the listing pages ``?p=0``, ``?p=18``, ... (below 500) in an
    endless cycle.  For every 2560x1600 link found on a listing page it
    fetches the detail page, extracts the title and the image URL, downloads
    the image into a file and puts the resulting item on the image queue.
    """

    name = 'Carwalls.com'
    charset = 'utf8'
    index = 'http://www.desktopmachine.com/'
    source = sources.add(4)

    # Endless cycle over the listing page query strings.
    pages = cycle('?p={0}'.format(page) for page in xrange(0, 500, 18))

    # Links to the 2560x1600 detail ("framepic") pages on a listing page.
    reFindImagesList = re.compile(u'<a href=([\S]+framepic\.php\?id=\d+&size=[\S]+)[^>]+>2560\s*x\s*1600<\/a>', re.S).findall
    # Wallpaper title taken from the <title> tag of a detail page.
    reFindTitle = re.compile(u"<title>(.+?)2560\s*x\s*1600 wallpaper<\/title>", re.S).search
    # URL of the 2560x1600 JPEG on a detail page.
    reFindPhoto = re.compile(u'<td colspan=2>\s*<img src=([\S]+\/pics\/[\S]+2560x1600\.(?:jpg))>\s*<\/td>', re.S).search

    @inlineCallbacks
    def _findImages(self):
        """Process the next listing page and enqueue every image found on it.

        Fires with the number of items successfully put on the image queue,
        or with ``None`` when ``self.loop == -1`` is observed after a sleep.

        Raises:
            ValueError: the listing page decoded to an empty string.
            Any error raised while requesting the listing page is not caught
            here; errors for individual images are logged and skipped.
        """
        self._stats.page = next(self.pages)

        # Request the listing page
        listing = yield httpQueue.request(url=self.getAbsoluteUrl(self._stats.page))
        listing = listing.decode(self.charset, 'ignore')

        if not listing:
            raise ValueError('Wow! Empty result')

        # Count of images successfully queued from this page
        count = 0

        for url in self.reFindImagesList(listing):
            # Sleep between requests
            yield self.sleepWithFireOnServiceStop(self.sleepValue, self.sleepSplit)

            msg('Spider', self.name, 'findImages, try', url)

            # NOTE(review): loop == -1 presumably marks a stopping service;
            # confirm against BaseIndexer.
            if self.loop == -1:
                returnValue(None)

            # Fetch the detail page
            try:
                page = yield httpQueue.request(url=self.getAbsoluteUrl(url))
                page = page.decode(self.charset, 'ignore')
            except Exception as e:
                msg('Spider', self.name, 'findImages request error', url)
                err(e)

                self._stats.errors.http += 1
                continue

            title_match = self.reFindTitle(page)
            image_match = self.reFindPhoto(page)

            title = title_match.group(1) if title_match else None
            image_url = image_match.group(1) if image_match else None

            if not title or not image_url:
                msg('Spider', self.name, 'findImages wrong title or image', repr((title, image_url)))
                continue

            # Make item; the item URL is the detail URL without its size part
            try:
                item = yield self._makeItem(title=title, url=url.split('&size')[0])
            except Exception as e:
                msg('Spider', self.name, 'findImages make item error')
                err(e)
                continue

            if not item['url']:
                msg('Spider', self.name, 'findImages wrong url', repr(item['url']))
                continue

            if not item['categories']:
                # Set default categories
                item['categories'].extend((103, 112))

            # Translate to list
            item['categories_names'] = list(item['categories_names'])

            # Sleep before downloading the image itself
            yield self.sleepWithFireOnServiceStop(self.sleepValue, self.sleepSplit)

            if self.loop == -1:
                returnValue(None)

            msg('Spider', self.name, 'findImages, try', image_url)

            # Create the file that receives the image body
            image_file = self._makeFile()

            try:
                yield httpQueue.request(url=self.getAbsoluteUrl(image_url), file=image_file)
            except Exception as e:
                msg('Spider', self.name, 'findImages request error', image_url)
                err(e)

                # Remove the file ourselves when it exposes a false ``delete``
                # flag (presumably a temporary file that is not auto-deleted).
                if hasattr(image_file, 'delete') and not image_file.delete:
                    os.unlink(image_file.name)

                self._stats.errors.http += 1
                continue
            finally:
                image_file.close()

            try:
                item.update(image=image_file)
                self.imageQueuePut(item)
            except Exception as e:
                msg('Spider', self.name, 'findImages create error')
                err(e)
                continue

            # Only items that actually reached the queue are counted
            count += 1

        returnValue(count)
from twisted.application.service import Application, MultiService

from core import constants
from sources.carwalls import CarwallsIndexer
from sources.bikewalls import BikewallsIndexer

# Root application object that owns the indexer services.
application = Application("ARX-Images Indexers")

# Container service holding one child service per indexer.
services = MultiService()
services.setServiceParent(application)

# Instantiate and register each indexer, in the original order.
for indexer_class in (CarwallsIndexer, BikewallsIndexer):
    services.addService(indexer_class())
def getImageHash(image):
    """Compute a hash of *image* as a tuple of eight integers.

    *image* is either an ``Image.Image`` instance or anything accepted by
    ``Image.open``.  The picture is resized to 128x128, filtered with a 5x5
    kernel, converted to grayscale and reduced to 16x16.  Each of the 256
    resulting pixels contributes one bit: ``1`` when the pixel value is above
    the threshold ``sum(pixels) / HASH_BITS``, ``0`` otherwise.  The bits are
    packed, 32 at a time, into eight integers.

    NOTE(review): the threshold is the mean pixel value only if HASH_BITS
    equals 256 -- confirm against its definition.
    """
    cdef unsigned int threshold, pixel, offset
    cdef unsigned long long chunk
    cdef list chunks = []

    if not isinstance(image, Image.Image):
        image = Image.open(image)

    image = image.resize((128, 128))

    # 5x5 kernel weighting only the left and right columns (scale 8, offset 0)
    kernel = ImageFilter.Kernel(
        (5, 5),
        (
            1, 0, 0, 0, 1,
            1, 0, 0, 0, 1,
            1, 0, 0, 0, 1,
            1, 0, 0, 0, 1,
            1, 0, 0, 0, 1
        ),
        8,
        0
    )
    image = ImageOps.grayscale(image.filter(kernel))

    image = image.resize((16, 16))
    image = image.convert('P', dither=Image.NONE)

    # Read the 16x16 pixel values once and reuse them below
    pixels = list(image.getdata())
    threshold = (sum(pixels) / HASH_BITS)

    for offset in xrange(0, 256, 32):
        # One bit per pixel, 32 pixels per packed integer
        chunk = int(''.join(
            '1' if pixel > threshold else '0'
            for pixel in pixels[offset:offset + 32]
        ), 2)

        # Add to results
        chunks.append(chunk)

    return tuple(chunks)
Source: https://habr.com/ru/post/147829/
All Articles