import requests                      # HTTP requests (supports proxies)
import urllib.request                # stdlib HTTP client
from lxml import html                # lxml HTML parsing (xpath)
import re                            # regular expressions for the search feature
from bs4 import BeautifulSoup        # HTML parsing
import csv                           # CSV reading and writing
from tkinter import *                # GUI widgets: Tk, Button, Label, Text, END
from tkinter.filedialog import *     # file dialogs
proxy1 = ''                                    # proxy currently in use (updated by main)
BASE_URL = 'https://www.weblancer.net/jobs/'   # first page of the job listing
massiv = []                                    # proxies that have already been used
root = Tk()                                    # main application window
root.geometry('850x500')
txt1 = Text(root, width=18, height=2)          # search query input
txt2 = Text(root, width=60, height=22)         # search results output
lbl4 = Label(root, text='')                    # shows the proxy currently in use
btn1 = Button(root, text='Parse')              # start parsing (captions were Russian in the original)
btn2 = Button(root, text='Search')             # search the saved CSV
btn3 = Button(root, text='Clear')              # clear both text fields
lbl1 = Label(root, text='Search query')        # prompt above the search field
lbl2 = Label(root, text='')                    # shows parsing progress in percent
lbl3 = Label(root, text='')                    # shows the total number of pages
# Note: in the assembled script this block has to come after the function
# definitions below, otherwise bind() raises a NameError for main/poisk/delete
# and mainloop() blocks before the handlers exist.
btn1.bind('<Button-1>', main)     # left click starts parsing
btn2.bind('<Button-1>', poisk)    # left click runs the search
btn3.bind('<Button-1>', delete)   # left click clears the fields
lbl2.grid(row=4, column=1)
lbl4.grid(row=5, column=1)
lbl3.grid(row=3, column=1)
btn1.grid(row=1, column=1)
btn3.grid(row=2, column=1)
btn2.grid(row=1, column=2)
lbl1.grid(row=2, column=2)
txt1.grid(row=3, column=2)
txt2.grid(row=6, column=3)
root.mainloop()                   # enter the Tk event loop
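The buttons are wired with bind('<Button-1>'), which hands an event object to each handler. An equivalent, slightly more conventional wiring uses the Button command option, which takes a no-argument callable; this is only a sketch of an alternative, not part of the original script:

# Hypothetical alternative wiring: command= calls the function with no arguments,
# so the unused event parameter is passed as None.
btn1 = Button(root, text='Parse', command=lambda: main(None))
btn2 = Button(root, text='Search', command=lambda: poisk(None))
btn3 = Button(root, text='Clear', command=lambda: delete(None))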
def main(event):                                          # handler for the "Parse" button
    global proxy1
    page_count = get_page_count(get_html(BASE_URL))       # how many listing pages there are
    lbl3.config(text='Pages: ' + str(page_count))
    page = 1
    projects = []
    while page < page_count:                              # until every listing page has been fetched
        proxy = Proxy()                                   # rebuild the proxy list
        proxy = proxy.get_proxy()                         # pick a proxy that has not been used yet
        lbl4.update()
        lbl4.config(text='Proxy: ' + proxy)
        proxy1 = proxy                                    # remember which proxy is in use
        try:
            for i in range(1, 10):                        # fetch up to 9 pages through the same proxy
                page += 1
                lbl2.update()
                lbl2.config(text='%d%%' % (page / page_count * 100))   # progress up to 100%
                r = requests.get(BASE_URL + '?page=%d' % page,
                                 proxies={'https': proxy})
                parsing = BeautifulSoup(r.content, "lxml")             # parse the downloaded page
                projects.extend(parse(BASE_URL + '?page=%d' % page, parsing))
                save(projects, 'proj.csv')                # rewrite the CSV with everything collected so far
        except requests.exceptions.ProxyError:
            continue                                      # proxy refused the connection, take the next one
        except requests.exceptions.ConnectionError:
            continue                                      # connection dropped, take the next one
        except requests.exceptions.ChunkedEncodingError:
            continue                                      # truncated response, take the next one
def get_html(url):                            # download a page (used only for the first request)
    response = urllib.request.urlopen(url)
    return response.read()                    # raw HTML as bytes
def get_page_count(html):                              # extract the number of listing pages
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup('ul')[3:4]                       # the pagination <ul> is the fourth list on the page
    lis = [li for ul in pagination for li in ul.findAll('li')][-1]   # last <li> links to the last page
    for link in lis.find_all('a'):
        var1 = link.get('href')                        # e.g. '/jobs/?page=512'
        var2 = var1[-3:]                               # the page number is in the last characters
        return int(var2)
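Slicing the last three characters of the href assumes the page count always has three digits. A regex over the link is a bit more robust; a minimal sketch, assuming the pagination links still look like '/jobs/?page=NNN' and using the re module imported above:

# Hypothetical helper: pull the page number out of a link such as '/jobs/?page=512'.
def page_number_from_href(href):
    m = re.search(r'page=(\d+)', href)
    return int(m.group(1)) if m else 1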
class Proxy:                                               # scrapes a list of free proxy servers
    proxy_url = 'http://www.ip-adress.com/proxy_list/'     # page with the proxy table
    proxy_list = []

    def __init__(self):
        r = requests.get(self.proxy_url)                           # fetch the proxy list page
        tree = html.fromstring(r.content)                          # lxml.html.HtmlElement
        result = tree.xpath("//tr[@class='odd']/td[1]/text()")     # 'ip:port' strings
        for i in list(result):                                     # iterate over a copy so deleting is safe
            if i in massiv:                                        # drop proxies that were already used
                yy = result.index(i)
                del result[yy]
        self.list = result

    def get_proxy(self):                                   # return a proxy that has not been used yet
        global massiv
        for proxy in self.list:
            if 'https://' + proxy == proxy1:               # skip the proxy that is currently in use
                continue
            massiv = massiv + [proxy]                      # remember it as used
            url = 'https://' + proxy
            return url
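The scraped list is not checked for liveness, which is why main() has to catch ProxyError and ConnectionError. If a quick pre-check is wanted, something like the sketch below could be used; this is an assumption on my part (httpbin.org as a test endpoint, a 5-second timeout), not part of the original script:

# Hypothetical liveness check for a proxy URL such as 'https://1.2.3.4:8080'.
def proxy_works(proxy_url, timeout=5):
    try:
        requests.get('https://httpbin.org/ip', proxies={'https': proxy_url}, timeout=timeout)
        return True
    except requests.exceptions.RequestException:
        return False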
def parse(url, parsing):                       # extract project data from one listing page (url is unused)
    projects = []
    table = parsing.find('div', {'class': 'container-fluid cols_table show_visited'})   # block with the project rows
    for row in table.find_all('div', {'class': 'row'}):
        cols = row.find_all('div')
        price = row.find_all('div', {'class': 'col-sm-1 amount title'})     # project price
        cols1 = row.find_all('div', {'class': 'col-xs-12',
                                     'style': 'margin-top: -10px; margin-bottom: -10px'})   # description block
        if cols1 == []:                         # some projects have no description
            application_text = ''
        else:
            application_text = cols1[0].text
        cols2 = [category.text for category in row.find_all('a', {'class': 'text-muted'})]   # categories
        projects.append({'title': cols[0].a.text,
                         'category': cols2[0],
                         'applications': cols[2].text.strip(),
                         'price': price[0].text.strip(),
                         'description': application_text})
    return projects
def delete(event):                  # handler for the "Clear" button
    txt1.delete(1.0, END)           # clear the search field
    txt2.delete(1.0, END)           # clear the results field
def poisk(event):                                  # handler for the "Search" button
    file = open("proj.csv", "r")                   # CSV produced by save()
    rdr = csv.DictReader(file, fieldnames=['name', 'categori', 'zajavki', 'case', 'opisanie'])
    poisk = txt1.get(1.0, END)                     # search query from the input field
    poisk = poisk[:-1]                             # drop the trailing '\n' that Text.get() appends
    for rec in rdr:                                # scan every saved project
        data = rec['opisanie'].split(';')          # description column
        data1 = rec['case'].split(';')             # price column
        data = ('').join(data)
        data1 = ('').join(data1)
        w = re.findall(poisk, data)                # search the description, query is used as a regex
        if w != []:                                # the query matched
            if data1 == '':                        # no price given
                data1 = 'price not specified'      # placeholder (the original string was in Russian)
            txt2.insert(END, data + '--' + data1 + '\n' + '---------------' + '\n')
    file.close()
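Because the query goes straight into re.findall, characters such as '(' or '+' in the search box are interpreted as regex syntax. If a plain-text search is what is wanted, the query can be escaped first; a one-line sketch using the re module already imported above:

w = re.findall(re.escape(poisk), data)   # treat the query as literal text, not a pattern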
def save(projects, path):                            # write the collected projects to a CSV file at path
    with open(path, 'w') as csvfile:                 # 'w' overwrites the file on every call
        writer = csv.writer(csvfile)
        writer.writerow(('title', 'category', 'applications', 'price', 'description'))   # header row (Russian in the original)
        for project in projects:
            try:
                writer.writerow((project['title'], project['category'],
                                 project['applications'], project['price'],
                                 project['description']))
            except UnicodeEncodeError:               # the description cannot be encoded, save the row without it
                writer.writerow((project['title'], project['category'],
                                 project['applications'], project['price'], ''))
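The UnicodeEncodeError branch exists because open() without an explicit encoding falls back to the system locale (often cp1251 on Russian Windows), which cannot represent every character. A minimal variant, assuming Python 3, where the explicit UTF-8 encoding and newline='' (recommended by the csv module) make the except branch unnecessary; the name save_utf8 is hypothetical:

import csv

def save_utf8(projects, path):
    # Hypothetical variant of save(): UTF-8 output and no blank lines between CSV rows.
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('title', 'category', 'applications', 'price', 'description'))
        for project in projects:
            writer.writerow((project['title'], project['category'],
                             project['applications'], project['price'],
                             project['description']))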
Source: https://habr.com/ru/post/322608/