import requests
import json


def get_list_id_vacancies(area, text):
    # Collect the ids of all vacancies matching the search text in the given area.
    url_list = 'https://api.hh.ru/vacancies'
    list_id = []
    params = {'text': text, 'area': area}
    r = requests.get(url_list, params=params)
    found = json.loads(r.text)['found']  # total number of matching vacancies

    if found <= 500:  # the API returns at most 500 items per page, so one request is enough
        params['per_page'] = found
        r = requests.get(url_list, params=params)
        data = json.loads(r.text)['items']
        for vac in data:
            list_id.append(vac['id'])
    else:
        i = 0
        while i <= 3:  # pages 0..3 of 500 items each: the API exposes at most 2000 results
            params['per_page'] = 500
            params['page'] = i
            r = requests.get(url_list, params=params)
            if 200 != r.status_code:
                break
            data = json.loads(r.text)['items']
            for vac in data:
                list_id.append(vac['id'])
            i += 1

    return list_id
def get_vacancy(id):
    # Fetch the full JSON description of a single vacancy by its id.
    url_vac = 'https://api.hh.ru/vacancies/%s'
    r = requests.get(url_vac % id)
    return json.loads(r.text)
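As a quick check, a minimal usage sketch (the query string and area id 1, which is Moscow on hh.ru, are assumptions here): fetch the id list for one query and request the first vacancy. The full response looks like the JSON example below.

ids = get_list_id_vacancies(1, 'data scientist')
print(len(ids), 'vacancies found')

# The full response for one vacancy looks like the JSON example below.
vacancy = get_vacancy(ids[0])
print(json.dumps(vacancy, ensure_ascii=False, indent=2)[:500])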
{ "alternate_url": "https://hh.ru/vacancy/22285538", "code": null, "premium": false, "description": "<p> ....", "schedule": { "id": "fullDay", "name": " " }, "suitable_resumes_url": null, "site": { "id": "hh", "name": "hh.ru" }, "billing_type": { "id": "standard_plus", "name": "+" }, "published_at": "2017-09-05T11:43:08+0300", "test": null, "accept_handicapped": true, "experience": { "id": "noExperience", "name": " " }, "address": { "building": "367", "city": "", "description": null, "metro": { "line_name": "", "station_id": "8.470", "line_id": "8", "lat": 55.736478, "station_name": " ", "lng": 37.514401 }, "metro_stations": [ { "line_name": "", "station_id": "8.470", "line_id": "8", "lat": 55.736478, "station_name": " ", "lng": 37.514401 } ], "raw": null, "street": " ", "lat": 55.739068, "lng": 37.525432 }, "key_skills": [ { "name": " " }, { "name": " " } ], "allow_messages": true, "employment": { "id": "full", "name": " " }, "id": "22285538", "response_url": null, "salary": { "to": 90000, "gross": false, "from": 50000, "currency": "RUR" }, "archived": false, "name": "/ Data scientist", "contacts": null, "employer": { "logo_urls": { "90": "https://hhcdn.ru/employer-logo/1680554.png", "240": "https://hhcdn.ru/employer-logo/1680555.png", "original": "https://hhcdn.ru/employer-logo-original/309546.png" }, "vacancies_url": "https://api.hh.ru/vacancies?employer_id=1475513", "name": " ", "url": "https://api.hh.ru/employers/1475513", "alternate_url": "https://hh.ru/employer/1475513", "id": "1475513", "trusted": true }, "created_at": "2017-09-05T11:43:08+0300", "area": { "url": "https://api.hh.ru/areas/1", "id": "1", "name": "" }, "relations": [], "accept_kids": false, "response_letter_required": false, "apply_alternate_url": "https://hh.ru/applicant/vacancy_response?vacancyId=22285538", "quick_responses_allowed": false, "negotiations_url": null, "department": null, "branded_description": null, "hidden": false, "type": { "id": "open", "name": "" }, "specializations": [ { "profarea_id": "14", "profarea_name": ", ", "id": "14.91", "name": ", " }, { "profarea_id": "14", "profarea_name": ", ", "id": "14.141", "name": "" }] }
{ "description": "<p> ....", "schedule": { "id": "fullDay", "name": " " }, "accept_handicapped": true, "experience": { "id": "noExperience", "name": " " }, "key_skills": [ { "name": " " }, { "name": " " } ], "employment": { "id": "full", "name": " " }, "id": "22285538", "salary": { "to": 90000, "gross": false, "from": 50000, "currency": "RUR" }, "name": "/ Data scientist", "employer": { "name": " ", }, "area": { "name": "" }, "specializations": [ { "profarea_id": "14", "profarea_name": ", ", "id": "14.91", "name": ", " }, { "profarea_id": "14", "profarea_name": ", ", "id": "14.141", "name": "" }] }
import pymysql


def get_salary(vac):
    # Salary may be absent from a vacancy; in that case return None for every field
    # so the row can still be written to the database.
    if vac['salary'] is None:
        return {'currency': None, 'from': None, 'to': None, 'gross': None}
    else:
        return {'currency': vac['salary']['currency'],
                'from': vac['salary']['from'],
                'to': vac['salary']['to'],
                'gross': vac['salary']['gross']}


def get_connection():
    conn = pymysql.connect(host='localhost', port=3306, user='root', password='-',
                           db='hh', charset="utf8")
    return conn


def close_connection(conn):
    conn.commit()
    conn.close()


def insert_vac(conn, vac, text):
    a = conn.cursor()
    salary = get_salary(vac)
    print(vac['id'])
    a.execute("INSERT INTO vacancies (id, name_v, description, code_hh, accept_handicapped, \
               area_v, employer, employment, experience, salary_currency, salary_from, salary_gross, \
               salary_to, schedule_d, text_search) \
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
              (vac['id'], vac['name'], vac['description'], vac['code'], vac['accept_handicapped'],
               vac['area']['name'], vac['employer']['name'], vac['employment']['name'],
               vac['experience']['name'], salary['currency'], salary['from'], salary['gross'],
               salary['to'], vac['schedule']['name'], text))

    for key_skill in vac['key_skills']:
        a.execute("INSERT INTO key_skills(vacancy_id, name) VALUES(%s, %s)",
                  (vac['id'], key_skill['name']))

    for spec in vac['specializations']:
        a.execute("INSERT INTO specializations(vacancy_id, name, profarea_name) VALUES(%s, %s, %s)",
                  (vac['id'], spec['name'], spec['profarea_name']))

    a.close()
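The INSERT statements assume three tables exist in the hh database. Their DDL is not shown here, so the sketch below only mirrors the column names used in insert_vac(); the column types and lengths are assumptions. Call it once on a fresh connection before the first insert.

def create_tables(conn):
    # Column names follow the INSERTs in insert_vac(); types and lengths are guesses.
    a = conn.cursor()
    a.execute("""CREATE TABLE IF NOT EXISTS vacancies (
                   id INT PRIMARY KEY,
                   name_v VARCHAR(255),
                   description TEXT,
                   code_hh VARCHAR(255),
                   accept_handicapped BOOL,
                   area_v VARCHAR(255),
                   employer VARCHAR(255),
                   employment VARCHAR(255),
                   experience VARCHAR(255),
                   salary_currency VARCHAR(10),
                   salary_from INT,
                   salary_gross BOOL,
                   salary_to INT,
                   schedule_d VARCHAR(255),
                   text_search VARCHAR(255))""")
    a.execute("""CREATE TABLE IF NOT EXISTS key_skills (
                   vacancy_id INT,
                   name VARCHAR(255))""")
    a.execute("""CREATE TABLE IF NOT EXISTS specializations (
                   vacancy_id INT,
                   name VARCHAR(255),
                   profarea_name VARCHAR(255))""")
    conn.commit()
    a.close()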
text_search = 'data scientist'
area = 1  # hh.ru id of the region to search in (1 is Moscow, matching the example above)

list_id_vacs = get_list_id_vacancies(area, text_search)

vacs = []
for vac_id in list_id_vacs:
    vacs.append(get_vacancy(vac_id))

conn = get_connection()
for vac in vacs:
    insert_vac(conn, vac, text_search)
close_connection(conn)
def get_vac_descriptions(conn, text_search):
    # Read back the descriptions stored for a given search query.
    a = conn.cursor()
    a.execute("SELECT description FROM vacancies WHERE text_search = %s", (text_search,))
    descriptions = a.fetchall()
    a.close()
    return descriptions
import string
import nltk
from collections import Counter


def get_popular_phrase(text, len, count_phrases):
    # Count the n-grams of the given length, skipping pure punctuation,
    # and return the count_phrases most frequent ones.
    phrase_counter = Counter()
    words = nltk.word_tokenize(text.lower())
    for phrase in nltk.ngrams(words, len):
        if all(word not in string.punctuation for word in phrase):
            phrase_counter[phrase] += 1
    return phrase_counter.most_common(count_phrases)


descriptions = get_vac_descriptions(get_connection(), 'data scientist')
text = ''
for description in descriptions:
    text = text + description[0]

result = get_popular_phrase(text, 1, 20)
for r in result:
    print(" ".join(r[0]) + " - " + str(r[1]))
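main() and get_stopwords() below call get_popular_phrase with a fourth stopwords argument. That updated definition is not reproduced above, so the version here is a sketch consistent with those calls; the parameter order (text, n, count_phrases, stopwords) is an assumption.

def get_popular_phrase(text, n, count_phrases, stopwords):
    # Same idea as above, but n-grams containing a stop word are skipped as well.
    phrase_counter = Counter()
    words = nltk.word_tokenize(text.lower())
    for phrase in nltk.ngrams(words, n):
        if all(word not in string.punctuation for word in phrase) \
                and all(word not in stopwords for word in phrase):
            phrase_counter[phrase] += 1
    return phrase_counter.most_common(count_phrases)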
def main():
    descriptions = get_vac_descriptions(get_connection(), 'data scientist')
    text = ''
    for description in descriptions:
        text = text + description[0]

    # stopwords is a global list built beforehand (see get_stopwords() below).
    result = get_popular_phrase(text, 4, 20, stopwords)
    for r in result:
        print(" ".join(r[0]) + " - " + str(r[1]))


main()
def get_stopwords():
    # Combine descriptions stored for several other search queries: words that are
    # frequent in every kind of vacancy carry no information about data scientists.
    descriptions = get_vac_descriptions(get_connection(), '') \
                   + get_vac_descriptions(get_connection(), '') + \
                   get_vac_descriptions(get_connection(), '')
    text = ''
    for description in descriptions:
        text = text + description[0]

    stopwords = []
    top_words = get_popular_phrase(text, 1, 200, [])  # the 200 most frequent single words
    for i in top_words:
        stopwords.append(i[0][0])
    return stopwords
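A minimal usage sketch, assuming the stop word list is built once before main() runs:

stopwords = get_stopwords()
print(stopwords[:20])  # a quick look at the generic words that will be filtered out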
for description in descriptions:
    # Keep only descriptions that are not in English; detect() here is assumed
    # to come from the langdetect package.
    if detect(description[0]) != 'en':
        text = text + description[0]
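The same filter can be wrapped into a small helper for reuse when building the text. A sketch assuming the langdetect package; build_text is a hypothetical name, not part of the original script.

from langdetect import detect


def build_text(descriptions):
    # Concatenate only the descriptions that are not detected as English.
    text = ''
    for description in descriptions:
        if detect(description[0]) != 'en':
            text = text + description[0]
    return text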
Source: https://habr.com/ru/post/337124/