```python
from time import sleep, strftime
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import smtplib
from email.mime.multipart import MIMEMultipart

# Change this to your own path to chromedriver!
chromedriver_path = 'C:/{YOUR PATH HERE}/chromedriver_win32/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chromedriver_path)  # This opens a Chrome window
sleep(2)
```
Here randint is used to make the bot "fall asleep" for a random number of seconds before starting a new search operation. Usually no bot can do without this. If you run the code above, a Chrome window will open, which the bot will use to work with sites.

To run a search, the kayak variable storing the URL is declared, and the get method of the web driver is used to open that URL. After clicking on the search button, the results should appear on the page.
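A minimal sketch of this step; the URL format is taken from the start_kayak function shown later, and the LIS-SIN route with flexible dates matches the sample values at the end of the article:

```python
# Open a flexible-dates Kayak search and give the page time to load
kayak = ('https://www.kayak.com/flights/LIS-SIN/'
         '2019-08-21-flexible/2019-09-07-flexible?sort=bestflight_a')
driver.get(kayak)
sleep(randint(8, 10))  # fall asleep for a random number of seconds
```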
When I used the get command more than two or three times within a few minutes, I was asked to pass a reCaptcha check. This check can be passed manually, after which you can continue the experiments until the system decides to arrange a new check. When I tested the script, it seemed that the first search session always runs without problems, so if you want to experiment with the code, you will only have to check in manually from time to time and let the code run, using long intervals between search sessions. And, if you think about it, a person is unlikely to need ticket price information collected at 10-minute intervals between searches.

For example, here are two ways to refer to the same element of the page, the link that sorts the results by price:

```
//*[@id="wtKI-price_aTab"]/div[1]/div/div/div[1]/div/span/span

cheap_results = '//a[@data-code = "price"]'
```
The second option is much simpler. It searches for an a element whose data-code attribute equals price. When using the first option, the element is searched for by its id, which equals wtKI-price_aTab, and the XPath to the element looks like /div[1]/div/div/div[1]/div/span/span. Such an XPath query to the page will do its job, but only once: I can tell you right away that the id will be different the next time the page loads. The character sequence wtKI changes dynamically each time the page is loaded, so the code that relies on it becomes useless after the next page reload. So take some time to figure out XPath. This knowledge will serve you well.

All search results are loaded into objects of the resultWrapper class. They can all be fetched in a loop resembling the one shown below.
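A sketch of such a loop, assuming the resultWrapper class mentioned above; the flight_containers and flights_list names come from the article text, while the exact XPath is my guess:

```python
# Grab every result container on the page and keep the raw text of each one
xp_results_table = '//*[@class = "resultWrapper"]'
flight_containers = driver.find_elements_by_xpath(xp_results_table)
flights_list = [flight.text for flight in flight_containers]
```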
If you understand what each part of this code refers to, you will have little trouble with most of the code in this project. Here, in particular, the flight_containers variable is used, and then flights_list: the text of every element found ends up in flights_list.

To collect as much data as possible, I use the load_more function. Pay attention to the try block, which I added because sometimes the button does not load normally. If you also come across this, comment out the calls to this function in the start_kayak function, which we will discuss below.

```python
# Load more results so that we capture as much data as possible
def load_more():
    try:
        more_results = '//a[@class = "moreButton"]'
        driver.find_element_by_xpath(more_results).click()
        # Printing these notes while the program runs helps me quickly see what it is doing
        print('sleeping.....')
        sleep(randint(45, 60))
    except:
        pass
```
Now we come to the most voluminous function, called page_scrape. Sometimes the returned data about the legs of a trip comes combined; to separate it I use a simple method, for example where I first use the section_a_list and section_b_list variables. The function returns the flights_df DataFrame, which allows us to keep apart the results obtained with different sorting methods and to combine them later.

```python
def page_scrape():
    """This function takes care of the scraping part"""

    xp_sections = '//*[@class="section duration"]'
    sections = driver.find_elements_by_xpath(xp_sections)
    sections_list = [value.text for value in sections]
    section_a_list = sections_list[::2]   # outbound legs sit on even indices
    section_b_list = sections_list[1::2]  # return legs sit on odd indices

    # If reCaptcha shows up, the scrape fails: you will know there is a problem
    # because the lists above come back empty. The if statement below lets you
    # exit the bot or handle the situation some other way; I raise SystemExit
    # because I want to restart everything from scratch after solving the captcha.
    if section_a_list == []:
        raise SystemExit

    # The letter a refers to the first leg of the trip, b to the second
    a_duration = []
    a_section_names = []
    for n in section_a_list:
        # Separate the duration from the city names
        a_section_names.append(''.join(n.split()[2:5]))
        a_duration.append(''.join(n.split()[0:2]))
    b_duration = []
    b_section_names = []
    for n in section_b_list:
        # Separate the duration from the city names
        b_section_names.append(''.join(n.split()[2:5]))
        b_duration.append(''.join(n.split()[0:2]))

    xp_dates = '//div[@class="section date"]'
    dates = driver.find_elements_by_xpath(xp_dates)
    dates_list = [value.text for value in dates]
    a_date_list = dates_list[::2]
    b_date_list = dates_list[1::2]
    # Split the day of the month from the day of the week
    a_day = [value.split()[0] for value in a_date_list]
    a_weekday = [value.split()[1] for value in a_date_list]
    b_day = [value.split()[0] for value in b_date_list]
    b_weekday = [value.split()[1] for value in b_date_list]

    # Getting the prices
    xp_prices = '//a[@class="booking-link"]/span[@class="price option-text"]'
    prices = driver.find_elements_by_xpath(xp_prices)
    prices_list = [price.text.replace('$','') for price in prices if price.text != '']
    prices_list = list(map(int, prices_list))

    # stops is one big list: the first leg is on even indices, the second on odd ones
    xp_stops = '//div[@class="section stops"]/div[1]'
    stops = driver.find_elements_by_xpath(xp_stops)
    stops_list = [stop.text[0].replace('n','0') for stop in stops]
    a_stop_list = stops_list[::2]
    b_stop_list = stops_list[1::2]

    xp_stops_cities = '//div[@class="section stops"]/div[2]'
    stops_cities = driver.find_elements_by_xpath(xp_stops_cities)
    stops_cities_list = [stop.text for stop in stops_cities]
    a_stop_name_list = stops_cities_list[::2]
    b_stop_name_list = stops_cities_list[1::2]

    # This part gets the airline name and the departure and arrival times for both legs
    xp_schedule = '//div[@class="section times"]'
    schedules = driver.find_elements_by_xpath(xp_schedule)
    hours_list = []
    carrier_list = []
    for schedule in schedules:
        hours_list.append(schedule.text.split('\n')[0])
        carrier_list.append(schedule.text.split('\n')[1])
    # Split the hours and the carriers between the a and b legs
    a_hours = hours_list[::2]
    a_carrier = carrier_list[::2]
    b_hours = hours_list[1::2]
    b_carrier = carrier_list[1::2]

    cols = (['Out Day', 'Out Time', 'Out Weekday', 'Out Airline', 'Out Cities',
             'Out Duration', 'Out Stops', 'Out Stop Cities',
             'Return Day', 'Return Time', 'Return Weekday', 'Return Airline',
             'Return Cities', 'Return Duration', 'Return Stops', 'Return Stop Cities',
             'Price'])

    flights_df = pd.DataFrame({'Out Day': a_day,
                               'Out Weekday': a_weekday,
                               'Out Duration': a_duration,
                               'Out Cities': a_section_names,
                               'Return Day': b_day,
                               'Return Weekday': b_weekday,
                               'Return Duration': b_duration,
                               'Return Cities': b_section_names,
                               'Out Stops': a_stop_list,
                               'Out Stop Cities': a_stop_name_list,
                               'Return Stops': b_stop_list,
                               'Return Stop Cities': b_stop_name_list,
                               'Out Time': a_hours,
                               'Out Airline': a_carrier,
                               'Return Time': b_hours,
                               'Return Airline': b_carrier,
                               'Price': prices_list})[cols]

    flights_df['timestamp'] = strftime("%Y%m%d-%H%M")  # so we know when the data was scraped
    return flights_df
```
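The even/odd slicing that page_scrape uses throughout relies on the page listing the outbound and return legs of each result one after another. A toy illustration of the idea (the values are made up):

```python
# Interleaved results: first leg on even indices, second leg on odd indices
sections_list = ['outbound 1', 'return 1', 'outbound 2', 'return 2']

section_a_list = sections_list[::2]   # ['outbound 1', 'outbound 2']
section_b_list = sections_list[1::2]  # ['return 1', 'return 2']
```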
Variables whose names begin with a refer to the first leg of the trip, and variables whose names begin with b refer to the second. Now let's move on to the next function.

Everything described above is used in the start_kayak function, which we will now consider.
In it, the kayak variable is created; it is used to go to the page containing the search results sorted by their best match to the query. After the first scraping session, we work with the prices in the table at the top of the page: we find the minimum ticket price and the average price. All of this, together with the prediction issued by the site, is sent by e-mail. The corresponding table should be in the upper left corner of the page. Working with this table, by the way, can cause an error when searching with exact dates, since in that case the table is not displayed on the page.

```python
def start_kayak(city_from, city_to, date_start, date_end):
    """City codes - it's the IATA codes!
    Date format - YYYY-MM-DD"""

    kayak = ('https://www.kayak.com/flights/' + city_from + '-' + city_to +
             '/' + date_start + '-flexible/' + date_end + '-flexible?sort=bestflight_a')
    driver.get(kayak)
    sleep(randint(8, 10))

    # Sometimes a popup shows up; check for it in a try block and close it
    try:
        xp_popup_close = '//button[contains(@id,"dialog-close") and contains(@class,"Button-No-Standard-Style close ")]'
        driver.find_elements_by_xpath(xp_popup_close)[5].click()
    except Exception as e:
        pass
    sleep(randint(60, 95))
    print('loading more.....')

    # load_more()

    print('starting first scrape.....')
    df_flights_best = page_scrape()
    df_flights_best['sort'] = 'best'
    sleep(randint(60, 80))

    # Get the minimum and average prices from the matrix at the top of the page
    matrix = driver.find_elements_by_xpath('//*[contains(@id,"FlexMatrixCell")]')
    matrix_prices = [price.text.replace('$','') for price in matrix]
    matrix_prices = list(map(int, matrix_prices))
    matrix_min = min(matrix_prices)
    matrix_avg = sum(matrix_prices)/len(matrix_prices)

    print('switching to cheapest results.....')
    cheap_results = '//a[@data-code = "price"]'
    driver.find_element_by_xpath(cheap_results).click()
    sleep(randint(60, 90))
    print('loading more.....')

    # load_more()

    print('starting second scrape.....')
    df_flights_cheap = page_scrape()
    df_flights_cheap['sort'] = 'cheap'
    sleep(randint(60, 80))

    print('switching to quickest results.....')
    quick_results = '//a[@data-code = "duration"]'
    driver.find_element_by_xpath(quick_results).click()
    sleep(randint(60, 90))
    print('loading more.....')

    # load_more()

    print('starting third scrape.....')
    df_flights_fast = page_scrape()
    df_flights_fast['sort'] = 'fast'
    sleep(randint(60, 80))

    # Save a new DataFrame as an Excel file; the name reflects the cities and dates
    final_df = df_flights_cheap.append(df_flights_best).append(df_flights_fast)
    final_df.to_excel('search_backups//{}_flights_{}-{}_from_{}_to_{}.xlsx'.format(strftime("%Y%m%d-%H%M"),
                                                                                   city_from, city_to,
                                                                                   date_start, date_end), index=False)
    print('saved df.....')

    # We can follow what the site predicts about prices and check how it turns out
    xp_loading = '//div[contains(@id,"advice")]'
    loading = driver.find_element_by_xpath(xp_loading).text
    xp_prediction = '//span[@class="info-text"]'
    prediction = driver.find_element_by_xpath(xp_prediction).text
    print(loading + '\n' + prediction)

    # Sometimes the loading variable contains this weird string, which would break
    # the e-mail message; if the site is "Not Sure", replace it with plain text
    weird = '¯\\_(ツ)_/¯'
    if loading == weird:
        loading = 'Not sure'

    username = 'YOUREMAIL@hotmail.com'
    password = 'YOUR PASSWORD'

    server = smtplib.SMTP('smtp.outlook.com', 587)
    server.ehlo()
    server.starttls()
    server.login(username, password)
    msg = ('Subject: Flight Scraper\n\n\
Cheapest Flight: {}\nAverage Price: {}\n\nRecommendation: {}\n\nEnd of message'.format(matrix_min, matrix_avg, (loading + '\n' + prediction)))
    # Note: the MIMEMultipart object is built here but the plain msg string is what actually gets sent
    message = MIMEMultipart()
    message['From'] = 'YOUREMAIL@hotmail.com'
    message['to'] = 'YOUROTHEREMAIL@domain.com'
    server.sendmail('YOUREMAIL@hotmail.com', 'YOUROTHEREMAIL@domain.com', msg)
    print('sent email.....')
```
```python
city_from = input('From which city? ')
city_to = input('Where to? ')
date_start = input('Search around which departure date? Please use YYYY-MM-DD format only ')
date_end = input('Return when? Please use YYYY-MM-DD format only ')

# city_from = 'LIS'
# city_to = 'SIN'
# date_start = '2019-08-21'
# date_end = '2019-09-07'

for n in range(0, 5):
    start_kayak(city_from, city_to, date_start, date_end)
    print('iteration {} was complete @ {}'.format(n, strftime("%Y%m%d-%H%M")))

    # Wait 4 hours before the next search session
    sleep(60 * 60 * 4)
    print('sleep finished.....')
```
Source: https://habr.com/ru/post/451872/