I would appreciate some help with scraping results from the following link: sample link
I'm using Python 3.7, BeautifulSoup 4 and Selenium.
I have coded a program to extract features of user reviews for hotels, such as reviewer name, review date, reviewer score, reviewer country, stay date, and review title, as well as the review itself (in this case, the review is separated into a positive and a negative section). I use BeautifulSoup 4 to extract the text from the HTML tags, and I rely on Selenium to click the 'cookie notification' button and to loop through the result pages.
While I am successfully looping through the result pages, I am not extracting any new content from the second page onwards: each of the N pages returns the same content as the first results page. I suspect this is because the contents are loaded via jQuery. At this point I'm not sure what the actual behaviour is, what I need to look for in the page source, or how to go about finding a solution. One idea I have not tried yet is adding an explicit wait after each pagination click (a rough sketch is at the end of this post).
Any hints or advice would be highly appreciated!
Crude snippet of my code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome('/Users/admin/Desktop/chrome_driver/chromedriver')
# Initiate the Chrome driver via Selenium and load the original URL
driver.get('link1')
def acceptCookies():
    # Wait briefly for the cookie banner to render, then click it if present
    time.sleep(3)
    elements = driver.find_elements_by_xpath("//button[@class='cookie-warning-v2__banner-cta bui-button bui-button--wide bui-button--secondary close_warning']")
    if elements:  # find_elements returns a (possibly empty) list, never None
        elements[0].click()
def getData(count, soup):
    try:
        for line in soup.find_all('li', class_='review_item'):
            count += 1
            review = {}
            review["review_metadata"] = {}
            review["review_metadata"]["review_date"] = line.find('p', class_='review_item_date').text.strip()
            if line.find('p', class_='review_staydate') is not None:
                review["review_metadata"]["review_staydate"] = line.find('p', class_='review_staydate').text.strip()
            review["review_metadata"]["reviewer_name"] = line.find('p', class_='reviewer_name').text.strip()
            print(review["review_metadata"]["reviewer_name"])
            review["review_metadata"]["reviewer_country"] = line.find('span', class_='reviewer_country').text.strip()
            review["review_metadata"]["reviewer_score"] = line.find('span', class_='review-score-badge').text.strip()
            if line.find('p', class_='review_pos') is not None:
                review["review_metadata"]["review_pos"] = line.find('p', class_='review_pos').text.strip()
            if line.find('p', class_='review_neg') is not None:
                review["review_metadata"]["review_neg"] = line.find('p', class_='review_neg').text.strip()
            scoreword = line.find('span', class_='review_item_header_scoreword')
            if scoreword is not None:
                review["review_metadata"]["review_header"] = scoreword.text.strip()
            else:
                review["review_metadata"]["review_header"] = ""
            hotel_reviews[count] = review
        return hotel_reviews
    except Exception as e:
        print('the error is', e)
# Finds max-range of pagination (number of result pages retrieved)
def find_max_pages():
    max_pages = driver.find_elements_by_xpath("//div[@class='bui-pagination__list']//div//span")
    max_pages = max_pages[-1].text
    max_pages = max_pages.split()
    max_pages = int(max_pages[1])
    return max_pages
hotel_reviews = {}
count = 0
review_page = {}
hotel_reviews_2 = []
# Accept the cookie notification
acceptCookies()
# Find Max Pages
max_pages = find_max_pages()
# Find every pagination link in order to loop through each review page carousel
element = driver.find_elements_by_xpath("//a[@class='bui-pagination__link']")
# Parse the first results page before entering the loop so getData has a soup to work on
soup = BeautifulSoup(driver.page_source, 'lxml')
for item in range(max_pages - 1):
    review_page = getData(count, soup)
    hotel_reviews_2.extend(review_page)
    time.sleep(2)
    element = driver.find_elements_by_xpath("//a[@class='bui-pagination__link']")
    element[item].click()
    driver.get(url=driver.current_url)
    print(driver.page_source)
    print(driver.current_url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
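For reference, this is the rough sketch I mentioned above. The idea would be to keep a reference to one of the current review elements before clicking, then use an explicit wait (WebDriverWait and expected_conditions are already imported at the top) until that element goes stale before re-parsing the page source, instead of calling driver.get(driver.current_url). I have not tested this; it is only a guess based on the same class names used in my code above.

    # Sketch only (untested) - intended to replace the click / driver.get part inside the for loop
    old_first_review = driver.find_elements_by_xpath("//li[contains(@class, 'review_item')]")[0]
    element = driver.find_elements_by_xpath("//a[@class='bui-pagination__link']")
    element[item].click()
    # Wait up to 10 seconds for the old review element to be detached from the DOM
    WebDriverWait(driver, 10).until(EC.staleness_of(old_first_review))
    soup = BeautifulSoup(driver.page_source, 'lxml')

Would something along these lines be the right direction, or is the problem elsewhere?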