Trying to scrape floor sizes (in sq ft) and lot sizes (in hectares) from listings on a real estate website using Beautiful Soup and Selenium.
The floor sizes print fine in the console

but when writing to a csv file, the 'sq ft' info under the floor size column is not extracted

It seems that if 'sq ft' is found by BS4 in the ID element after the one stipulated, that is returned instead, and all other 'sq ft' text is passed over on every other url when writing to the csv. As you can see on (image), two of the listings have this, despite those two links having hectares as well:
http://property.shw.co.uk/propertyInfo/11080/145151-London-Road-Croydon--CR0-2RG
http://property.shw.co.uk/propertyInfo/16162/Public-HouseRestaurant-Site-Westvale-Park-Horley-Surrey--RH6-0HJ
Can someone explain why the sq ft are printed on the console but not written to the csv? Any help would be appreciated.
Relevant HTML where CP2_CPContent_conDetails1_divDetails is relevant locator for floor sizes and lot sizes:
<div id="CP2_CPContent_conDetails1_divDetails">
0.3 Acres <br>(0.12 Hectares)
<div class="clear"></div>
<div id="CP2_CPContent_conDetails1_divDes" class="divInfo">
Potential building size of 6,458 sq ft (600 sq m)<br>
</div>
Code as follows:
driver = webdriver.Chrome()
shw_search_url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(shw_search_url)
#identify and extract listing links from each page
def get_house_links(url, driver, pages=3):
    """Collect listing hrefs from up to `pages` result pages.

    Returns a list of per-page lists of links (flattened later by
    flatten_list). Advances through results via the "propNext" image's
    parent anchor, when present.
    """
    house_links = []
    driver.get(url)
    for _ in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        hrefs = [anchor['href'] for anchor in soup.find_all("a", class_="L")]
        house_links.append(hrefs)
        # Randomized pause to look less like a bot.
        time.sleep(np.random.lognormal(0, 1))
        next_imgs = soup.select('img[src*="propNext"]')
        if next_imgs:
            parent_anchor = next_imgs[0].find_parent('a')
            driver.get('http://property.shw.co.uk' + parent_anchor['href'])
    return house_links
#get html data from url and return as object
def get_html_data(url, driver):
    """Load `url` in the browser, pause briefly, and return the parsed page.

    Returns a BeautifulSoup object built from the rendered page source.
    """
    driver.get(url)
    # Randomized pause to look less like a bot.
    time.sleep(np.random.lognormal(0, 1))
    return BeautifulSoup(driver.page_source, 'html.parser')
def get_lot_size(soup):
    """Return the lot size text (e.g. '0.12 Hectares') from the details div.

    Searches the div with id CP2_CPContent_conDetails1_divDetails for the
    first text containing 'Hectares' and strips the surrounding parentheses.
    Returns 'NA' when the div is absent or no 'Hectares' text is found.
    """
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            lot_size = element.find_next(text=re.compile('Hectares'))
            # The site renders it as '(0.12 Hectares)' — drop the parentheses.
            lot_size = lot_size.replace("(", "").replace(")", "")
            print(lot_size)
            return lot_size
        return 'NA'  # details div not present on this listing
    except AttributeError:
        # find_next returned None (no 'Hectares' text) and .replace raised.
        # Narrowed from a bare `except:` so real bugs aren't silently hidden.
        return 'NA'
def get_floor_size(soup):
    """Return the first 'sq ft' text from the details div, or 'NA'.

    Bug fix: the original iterated over `soup.find(...)`. `find` returns a
    single Tag, and iterating a Tag walks its *children*; calling
    `find_next` from a child text node then escapes into the following
    sibling div (CP2_CPContent_conDetails1_divDes), which is why the wrong
    'sq ft' value was picked up. Using `find_all` (as get_lot_size does)
    iterates the matching div(s) themselves.
    """
    try:
        for element in soup.find_all('div', {'id': 'CP2_CPContent_conDetails1_divDetails'}):
            floor_size = element.find_next(text=re.compile('sq ft'))
            if floor_size is None:
                return 'NA'  # div present, but no 'sq ft' text inside/after it
            print(floor_size)
            return floor_size
        return 'NA'  # details div not present on this listing
    except AttributeError:
        # Narrowed from a bare `except:` so real bugs aren't silently hidden.
        return 'NA'
def flatten_list(house_links):
    """Flatten a list of per-page link lists into a single flat list."""
    return [link for page in house_links for link in page]
def get_house_data(driver, house_links_flat):
    """Visit every listing URL and return [floor_size, lot_size] rows."""
    house_data = []
    for url in house_links_flat:
        page = get_html_data(url, driver)
        house_data.append([get_floor_size(page), get_lot_size(page)])
    return house_data
# Scrape three result pages of links, then fetch size data for each listing.
house_links_3pages = get_house_links(shw_search_url, driver, pages=3)
house_links_flat = flatten_list(house_links_3pages)
house_data_3pages = get_house_data(driver, house_links_flat)

# Open and write results to csv.
# Bug fix: the original time format "%H:%M%S" embedded a ':' in the file
# name — ':' is illegal in Windows file names, so to_csv can fail silently
# from the user's point of view — and it was also missing the separator
# between minutes and seconds. Use '-' throughout instead.
file_name = "SHW %s_%s.csv" % (time.strftime("%Y-%m-%d"),
                               time.strftime("%H-%M-%S"))
columns = ["Floor_Size", "Lot_Size"]
pd.DataFrame(house_data_3pages, columns=columns).to_csv(
    file_name, index=False, encoding="UTF-8"
)