import pycurl

c = pycurl.Curl()
url = 'ya.ru'
c.setopt(pycurl.URL, url)
from StringIO import StringIO

# collect the response body into a buffer instead of dumping it to stdout
c.bodyio = StringIO()
c.setopt(pycurl.WRITEFUNCTION, c.bodyio.write)
c.get_body = c.bodyio.getvalue
# behave more like a regular browser: follow redirects, set timeouts,
# and send typical browser headers
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.setopt(pycurl.CONNECTTIMEOUT, 60)
c.setopt(pycurl.TIMEOUT, 120)
c.setopt(pycurl.NOSIGNAL, 1)  # avoid signals so timeouts work in threaded code
c.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:13.0) Gecko/20100101 Firefox/13.0')
httpheader = [
    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language: ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
    'Accept-Charset: utf-8;q=0.7,*;q=0.5',
    'Connection: keep-alive',
]
c.setopt(pycurl.HTTPHEADER, httpheader)
c.perform()
print c.get_body()
print c.getinfo(pycurl.HTTP_CODE)
if c.getinfo(pycurl.HTTP_CODE) != 200:
    raise Exception('HTTP code is %s' % c.getinfo(pycurl.HTTP_CODE))
import pycurl

try:
    from cStringIO import StringIO  # faster C implementation, if available
except ImportError:
    from StringIO import StringIO

def get_page(url, *args, **kargs):
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    # collect the response body and headers into buffers
    c.bodyio = StringIO()
    c.setopt(pycurl.WRITEFUNCTION, c.bodyio.write)
    c.get_body = c.bodyio.getvalue
    c.headio = StringIO()
    c.setopt(pycurl.HEADERFUNCTION, c.headio.write)
    c.get_head = c.headio.getvalue

    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 60)
    c.setopt(pycurl.TIMEOUT, 120)
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:13.0) Gecko/20100101 Firefox/13.0')
    httpheader = [
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3',
        'Accept-Charset: utf-8;q=0.7,*;q=0.5',
        'Connection: keep-alive',
    ]
    c.setopt(pycurl.HTTPHEADER, httpheader)

    c.perform()
    if c.getinfo(pycurl.HTTP_CODE) != 200:
        raise Exception('HTTP code is %s' % c.getinfo(pycurl.HTTP_CODE))
    return c.get_body()
print get_page('ya.ru')
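The snippets above are Python 2 (StringIO, print statements, urlparse). For Python 3, a minimal sketch of the same get_page idea could look like the following; the function name and the utf-8 decoding are assumptions, not part of the original article:

import io
import pycurl

def get_page_py3(url):
    # hypothetical Python 3 port of get_page: pycurl hands the body to the
    # write callback as bytes, so io.BytesIO replaces StringIO
    c = pycurl.Curl()
    body = io.BytesIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, body.write)
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.setopt(pycurl.CONNECTTIMEOUT, 60)
    c.setopt(pycurl.TIMEOUT, 120)
    c.setopt(pycurl.NOSIGNAL, 1)
    c.perform()
    if c.getinfo(pycurl.HTTP_CODE) != 200:
        raise Exception('HTTP code is %s' % c.getinfo(pycurl.HTTP_CODE))
    return body.getvalue().decode('utf-8', 'replace')  # assumes utf-8 pages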
import urllib
import urlparse

key = ''
region = 213  # Yandex region code; 213 is Moscow
page = 1
params = ['http', 'yandex.ru', '/yandsearch', '', '', '']
params[4] = urllib.urlencode({
    'text': key,
    'lr': region,
    'p': page - 1,  # Yandex pages are zero-based
})
url = urlparse.urlunparse(params)
print url
html = get_page(url)
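For an illustrative key such as 'python', the assembled URL comes out roughly as http://yandex.ru/yandsearch?text=python&lr=213&p=0 (the query-parameter order may vary, since it follows dict iteration order).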
import lxml.html

site_list = []
for h2 in lxml.html.fromstring(html).find_class('b-serp-item__title'):
    b = h2.find_class('b-serp-item__number')
    if len(b):  # skip result blocks that carry no position number
        num = b[0].text.strip()
        url = h2.find_class('b-serp-item__title-link')[0].attrib['href']
        site = urlparse.urlparse(url).hostname
        site_list.append((num, site, url))  # (position, hostname, full URL)
print site_list
def site_list(key, region=213, page=1):
    params = ['http', 'yandex.ru', '/yandsearch', '', '', '']
    params[4] = urllib.urlencode({
        'text': key,
        'lr': region,
        'p': page - 1,
    })
    url = urlparse.urlunparse(params)
    html = get_page(url)
    site_list = []
    for h2 in lxml.html.fromstring(html).find_class('b-serp-item__title'):
        b = h2.find_class('b-serp-item__number')
        if len(b):
            num = b[0].text.strip()
            url = h2.find_class('b-serp-item__title-link')[0].attrib['href']
            site = urlparse.urlparse(url).hostname
            site_list.append((num, site, url))
    return site_list
print site_list('', 213, 2)
def cut_www(site):
    if site.startswith('www.'):
        site = site[4:]
    return site
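A quick sanity check of the helper:

print cut_www('www.habrahabr.ru')  # habrahabr.ru
print cut_www('habrahabr.ru')      # habrahabr.ru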
site = 'habrahabr.ru'
for pos, s, url in site_list('python', 213, 1):
    if cut_www(s) == site:
        print pos, url
import math

def site_position(site, key, region=213, max_position=10):
    site = cut_www(site)
    # walk the result pages (10 hits each) until max_position is covered
    for page in range(1, int(math.ceil(max_position / 10.0)) + 1):
        for pos, s, url in site_list(key, region, page):
            if cut_www(s) == site:
                return pos, url
    return None, None
print site_position('habrahabr.ru', 'python', 213, 100)
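A hedged sketch of batch usage, not from the original article: checking one site against several keys, pausing between requests because Yandex may answer rapid automated queries with a captcha (the key list and interval are arbitrary):

import time

for key in ['python', 'pycurl', 'lxml']:
    pos, url = site_position('habrahabr.ru', key, 213, 100)
    print key, pos, url
    time.sleep(5)  # arbitrary pause between requests, to stay polite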
Source: https://habr.com/ru/post/146258/