#==================< Imports >==================
import urllib2
# Works with HTTP: builds requests and opens URLs
import urllib
# Also HTTP-related; complements urllib2
# with helpers such as urllib.quote
from Queue import Queue
# A thread-safe "pool" of tasks: threads can take
# work items from it without any extra locking,
# which is exactly what we need here
import threading
# Thread support; we use
# threading.active_count, threading.Thread, threading.Thread.start,
# threading.RLock
import re
# Regular expressions, used to pull the links and
# titles out of the downloaded pages
import time
# Needed only for time.sleep
queue = Queue()
# The shared task queue (note that the Queue
# module and the Queue class share the same name)
#==================</ Imports >=================
#==============================< Settings >==============================
PROXY = "10.10.31.103:3128"
# The HTTP proxy to send requests through,
# in host:port form; all traffic will go via
# the server named in PROXY.
# Set it to None to connect directly
HEADERS = {"User-Agent": "Opera/9.64 (Windows NT 5.1; U; en) Presto/2.1.1",
           "Accept": "text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
           "Accept-Language": "ru,uk-UA;q=0.9,uk;q=0.8,en;q=0.7",
           "Accept-Charset": "iso-8859-1, utf-8, utf-16, *;q=0.1",
           "Accept-Encoding": "identity, *;q=0",
           "Connection": "Keep-Alive"}
# Headers that make www.google.com treat us as a
# regular Opera browser. Note the HEADERS entry
# "Accept-Encoding: identity": we refuse compressed
# responses, otherwise Google would send back
# zlib-compressed data instead of plain HTML,
# and we would have to decompress it ourselves...
THREADS_COUNT = 10
# How many worker threads to run in parallel
DEEP = 30
# How deep to dig into the search results: the total
# number of results to request per query; Google
# serves them ten per page.
ENCODING = "UTF-8"
# The encoding of the input file with the search
# queries (requests.txt)
#==============================</ Settings >===================================
LOCK = threading.RLock()
# A reentrant lock from the threading module.
# LOCK guards the output file: threading.RLock (like
# threading.Lock) lets only one thread at a time pass
# the acquire() call, so concurrent writers cannot
# interleave their output.
# The difference between the two: a thread that already
# holds a threading.RLock may acquire() it again, while
# with threading.Lock a repeated acquire() by the same
# thread would block it forever.
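
That reentrancy difference is easy to see in isolation; a minimal sketch, not part of the parser:

import threading

rlock = threading.RLock()
rlock.acquire()
rlock.acquire()    # fine: the owning thread may re-acquire an RLock
rlock.release()
rlock.release()    # release as many times as acquired

lock = threading.Lock()
lock.acquire()
# a second lock.acquire() here would block this thread forever:
# a plain Lock is not reentrant even for its owner
lock.release()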
def worker():
    # The worker function; every thread runs a copy of it
    global queue
    # The queue is a module-level global shared by all
    # the threads (and Queue is thread-safe!)
    while True:
        # Keep taking tasks until the queue runs dry
        try:
            # Wrap the non-blocking get in try/except:
            # on an empty queue get_nowait raises Queue.Empty,
            # which is our signal to stop
            target_link = queue.get_nowait()
            # Take the next search-results URL
            # from the queue
        except Exception, error:
            # The queue is empty, nothing left to do
            return
            # The thread terminates
        parsed_data = get_and_parse_page(target_link)
        # Download the page and extract the links
        # and titles from it
        if parsed_data != "ERROR":
            # The page was fetched and parsed successfully
            write_to_file(parsed_data)
            # Append the results to the output file
        else:
            queue.put(target_link)
            # The download failed; put the URL back into
            # the queue for another attempt
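
Catching a bare Exception works here, but the idiomatic form catches only the Empty exception that get_nowait actually raises; a sketch of the same loop:

from Queue import Empty

def worker():
    # same loop as above, but only the empty-queue
    # condition ends the thread
    while True:
        try:
            target_link = queue.get_nowait()
        except Empty:
            return
        parsed_data = get_and_parse_page(target_link)
        if parsed_data != "ERROR":
            write_to_file(parsed_data)
        else:
            queue.put(target_link)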
def write_to_file(parsed_data):
    # write_to_file appends the parsed results to a file
    global LOCK
    global ENCODING
    LOCK.acquire()
    # Take the lock, so no other thread can write
    # to the file at the same time
    with open("parsed_data.txt", "a") as out:
        # Open parsed_data.txt via a with statement in
        # append mode ("a"); the file object is bound to
        # out (and is closed automatically on exit)
        for site in parsed_data:
            # Walk over the parsed data; each item is
            # a (link, title) pair, bound to site
            link, title = site[0], site[1]
            # Unpack link and title from site
            title = title.replace("<em>", "").replace("</em>", "").replace("<b>", "").replace("</b>", "")
            # .replace strips the HTML highlight tags from title
            out.write(u"{link}|{title}\n".format(link=link, title=title).encode("cp1251"))
            # Build the output line with .format (the newer
            # alternative to % formatting) in the form
            # link|title\n and encode it to cp1251 before
            # writing it out
    LOCK.release()
    # Release the lock so the waiting threads can take
    # their turn. Without the lock, several threads
    # writing at once could interleave their lines and
    # produce a garbled parsed_data.txt
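
A quick way to see the output format, using made-up data shaped like what the parser returns:

sample = [(u"http://example.com/", u"<em>Example</em> Domain"),
          (u"http://example.org/", u"Another <b>hit</b>")]
write_to_file(sample)
# parsed_data.txt now ends with (cp1251-encoded):
# http://example.com/|Example Domain
# http://example.org/|Another hit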
def get_and_parse_page(target_link):
    # Downloads one results page and parses it
    global PROXY
    # The proxy settings live in the module-level
    # global PROXY
    global HEADERS
    # Same for the headers
    if PROXY is not None:
        # Only set the proxy up when PROXY is not None
        proxy_handler = urllib2.ProxyHandler({"http": "http://" + PROXY + "/"})
        # A handler that routes HTTP traffic through the proxy
        opener = urllib2.build_opener(proxy_handler)
        # Build an opener around the proxy handler
        urllib2.install_opener(opener)
        # Install the opener globally, so that every
        # subsequent urllib2 call in this process goes
        # through the server named in PROXY
    page_request = urllib2.Request(url=target_link, headers=HEADERS)
    # Build a Request instance: a GET request for the
    # target URL carrying our browser-like headers...
    try:
        # The download can fail for many reasons
        # (timeouts, bans, connection resets), so
        # guard it with try/except
        page = urllib2.urlopen(url=page_request).read().decode("UTF-8", "replace")
        # page receives the body of the response, decoded
        # from UTF-8 (what www.google.com serves) into
        # unicode (in Python 2.x str and unicode are
        # different types!)
    except Exception, error:
        # The exception lands in error
        print str(error)
        # Print it, so we can see what went wrong
        # (purely for diagnostics)
        return "ERROR"
        # The caller re-queues the link on "ERROR"
    harvested_data = re.findall(r'''\<li\ class\=g\>\<h3\ class\=r\>\<a\ href\=\"(.*?)".*?>(.*?)\<\/a\>\<\/h3\>''', page)
    # Pull the (link, title) pairs out of the raw HTML;
    # the pattern matches Google's result markup of the time.
    harvested_data = [data for data in harvested_data
                      if not data[0].startswith("/")
                      and ".google.com" not in data[0]]
    # Drop relative links and Google's own service links.
    # (A list comprehension is used because calling
    # list.remove while iterating over the same list
    # skips elements.)
    return harvested_data
    # Hand the cleaned results back to the worker
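
The pattern can be sanity-checked without hitting Google by running it against a hand-written fragment in that old markup; the snippet below is a made-up sample:

sample_html = u'<li class=g><h3 class=r><a href="http://example.com/" class=l>An <em>example</em> result</a></h3>'
print re.findall(r'''\<li\ class\=g\>\<h3\ class\=r\>\<a\ href\=\"(.*?)".*?>(.*?)\<\/a\>\<\/h3\>''', sample_html)
# [(u'http://example.com/', u'An <em>example</em> result')]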
def main():
    # The entry point: fills the queue and starts the threads
    print "STARTED"
    # Report that the run has begun
    global THREADS_COUNT
    global DEEP
    global ENCODING
    # The configuration values from the settings
    # block above
    with open("requests.txt") as requests:
        # Open the file with the search queries
        for request in requests:
            # One query per line; iterating over the file
            # object yields it line by line, so even a huge
            # file does not have to fit into memory at
            # once :)
            request = request.translate(None, "\r\n").decode(ENCODING, "replace")
            # Strip the line-ending characters and decode
            # the query into unicode (using the configured
            # encoding)
            empty_link = "http://www.google.com/search?hl=ru&client=opera&rls=ru&hs=67v&q={request}&start={N}&sa=N"
            # The search-URL template; the query and the
            # result offset are substituted in below
            for i in xrange(0, DEEP, 10):
                # Walk from 0 up to DEEP in steps of 10,
                # because Google numbers its result pages by
                # offset: start=0, 10, 20, 30 and so on
                # (ten results per page)
                queue.put(empty_link.format(request=request.encode("UTF-8"), N=i))
                # Fill the queue with ready-to-fetch URLs; the
                # query is encoded back to UTF-8 (URLs are
                # byte strings)
    for _ in xrange(THREADS_COUNT):
        # Spawn the worker threads
        thread_ = threading.Thread(target=worker)
        # target is the function the new thread will run,
        # i.e. our worker
        thread_.start()
        # start() actually launches the thread
    while threading.active_count() > 1:
        # Wait until only the main thread is left alive
        # (meaning all the workers have finished)
        time.sleep(1)
        # Poll once a second
    print "FINISHED"
    # Report completion
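
As extracted, the listing never actually calls main(); the standard Python entry-point guard completes it:

if __name__ == "__main__":
    main()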
Source: https://habr.com/ru/post/78267/