I've tried running a script on Windows and on Ubuntu, both using Python 3 and the latest versions of geckodriver, resulting in differing behavior. The full script is given below.
I'm trying to get the data for several different tests from a test prep site. There are different subjects, each of which has a specialization, each of which has a practice-test, each of which has several questions. The scrape function walks through the steps to get data of each type.
subject <--- specialization <---- practice-test *------ question
The get_questions function is where the difference shows up:
- In Windows, it behaves as expected. After the last question's choice is clicked, it goes on to a results page.
In Ubuntu, when a choice is clicked on the last question, it reloads the last question and keeps clicking the same choice and reloading the same question.
from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import pathlib import time import json import os driver=webdriver.Firefox(executable_path="./geckodriver.exe") wait = WebDriverWait(driver, 15) data=[] def setup(): driver.get('https://www.varsitytutors.com/practice-tests') try: go_away_1= driver.find_element_by_class_name("ub-emb-iframe") driver.execute_script("arguments[0].style.visibility='hidden'", go_away_1) go_away_2= driver.find_element_by_class_name("ub-emb-iframe-wrapper") driver.execute_script("arguments[0].style.visibility='hidden'", go_away_2) go_away_3= driver.find_element_by_class_name("ub-emb-visible") driver.execute_script("arguments[0].style.visibility='hidden'", go_away_3) except: pass def get_subjects(subs=[]): subject_clickables_xpath="/html/body/div[3]/div[9]/div/*/div[@data-subject]/div[1]" subject_clickables=driver.find_elements_by_xpath(subject_clickables_xpath) subject_names=map(lambda x : x.find_element_by_xpath('..').get_attribute('data-subject'), subject_clickables) subject_pairs=zip(subject_names, subject_clickables) return subject_pairs def get_specializations(subject): specialization_clickables_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]" specialization_names_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]/../.." specialization_names=map(lambda x : x.get_attribute('data-subject'), driver.find_elements_by_xpath(specialization_names_xpath)) specialization_clickables = driver.find_elements_by_xpath(specialization_clickables_xpath) specialization_pairs=zip(specialization_names, specialization_clickables) return specialization_pairs def get_practices(subject, specialization): practice_clickables_xpath="/html/body/div[3]/div[8]/div[3]/*/div[1]/a[1]" practice_names_xpath="//*/h3[@class='subject_header']" lengths_xpath="/html/body/div[3]/div[8]/div[3]/*/div[2]" lengths=map(lambda x : x.text, driver.find_elements_by_xpath(lengths_xpath)) print(lengths) practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath)) practice_clickables = driver.find_elements_by_xpath(practice_clickables_xpath) practice_pairs=zip(practice_names, practice_clickables) return practice_pairs def remove_popup(): try: button=wait.until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'No Thanks')]"))) button.location_once_scrolled_into_view button.click() except: print('could not find the popup') def get_questions(subject, specialization, practice): remove_popup() questions=[] current_question=None while True: question={} try: WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]"))) question_number=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]').text.replace('.','') question_pre=driver.find_element_by_class_name('question_pre') question_body=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[2]/p') answer_choices=driver.find_elements_by_class_name('question_row') answers=map(lambda x : x.text, answer_choices) question['id']=question_number question['pre']=question_pre.text question['body']=question_body.text question['answers']=list(answers) questions.append(question) choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button"))) driver.execute_script("arguments[0].click();", choice[3]) time.sleep(3) except Exception as e: if 'results' in driver.current_url: driver.get(driver.current_url.replace('http://', 'https://')) # last question has been answered; record results remove_popup() pathlib.Path('data/'+subject+'/'+specialization).mkdir(parents=True, exist_ok=True) with open('data/'+subject+'/'+specialization+'/questions.json', 'w') as outfile: json.dump(list(questions), outfile) break else: driver.get(driver.current_url.replace('http://', 'https://')) return questions def scrape(): setup() subjects=get_subjects() for subject_name, subject_clickable in subjects: subject={} subject['name']=subject_name subject['specializations']=[] subject_clickable.click() subject_url=driver.current_url.replace('http://', 'https://') specializations=get_specializations(subject_name) for specialization_name, specialization_clickable in specializations: specialization={} specialization['name']=specialization_name specialization['practices']=[] specialization_clickable.click() specialization_url=driver.current_url.replace('http://', 'https://') practices=get_practices(subject_name, specialization_name) for practice_name, practice_clickable in practices: practice={} practice['name']=practice_name practice_clickable.click() questions=get_questions(subject_name, specialization_name, practice_name) practice['questions']=questions driver.get(specialization_url) driver.get(subject_url) data.append(subject) print(data) scrape()
Can anyone help me figure out what may be causing this?