My goal is to scrape tax data from public county websites. Using Scrapy and Selenium I was able to gather parcel IDs, addresses and owner names, but my output returned null for elements located in a table that disappears when I disable JavaScript.
Each search result from this particular county website returns a list of properties with links to their tax details. The data I am attempting to scrape is on the pages behind those links.
What is the best way to gather the land_value and total_appr? Again, when I disabled JavaScript in the Chrome developer tools the values were no longer visible, so do I have to use Selenium in another for loop? How do I go about doing this? Thanks.
from abc import ABC
from time import sleep
from urllib.parse import urljoin

import scrapy
from scrapy import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from selenium import webdriver
class PropertiesSpider(Spider, ABC):
    """Scrape parcel tax details for an owner-name search on the Gwinnett assessor site.

    The search form is driven with a Selenium-controlled Chrome browser; the
    links found in the rendered result list are then fetched as ordinary
    Scrapy requests and parsed by :meth:`parse_property`.

    The land value and total appraisal are reportedly rendered client-side
    (they vanish when JavaScript is disabled), so when they are missing from
    the plain Scrapy response the page is re-rendered in the Selenium browser
    and the values are read from the rendered markup instead.
    """

    name = 'properties'
    # Host names only: an entry containing a path never matches a request's
    # host, so the offsite filter would reject the detail-page requests.
    allowed_domains = ['gwinnettassessor.manatron.com']

    # Search page driven through Selenium; also the base for relative hrefs.
    search_url = 'http://gwinnettassessor.manatron.com/IWantTo/PropertyGISSearch.aspx'
    # Location of the chromedriver binary handed to Selenium.
    chromedriver_path = '/Users/wenzeljoe/CondaProjects/chromedriver'
    # Text typed into the search box.
    search_term = 'smith'

    def start_requests(self):
        """Run the search in Chrome and yield one request per result link.

        The browser is stored on ``self.driver`` so that it can be reused for
        rendering detail pages and shut down in :meth:`closed`.
        """
        self.driver = webdriver.Chrome(self.chromedriver_path)
        self.driver.get(self.search_url)

        search_bar = self.driver.find_element_by_xpath("//table/tbody/tr/td/div/input")
        search_bar.send_keys(self.search_term)
        self.driver.find_element_by_xpath("//button/span").click()

        results = Selector(text=self.driver.page_source)
        for href in results.xpath("//ul[2]/li/a/@href").extract():
            # urljoin resolves query-only, relative and absolute hrefs alike,
            # whereas plain concatenation is only right for "?query" hrefs.
            yield Request(urljoin(self.search_url, href), callback=self.parse_property)

    def closed(self, reason):
        """Quit the Selenium browser when the spider finishes.

        Args:
            reason: Close reason passed in by Scrapy; not used here.
        """
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def parse_property(self, response):
        """Extract one property's details from its tax detail page.

        Args:
            response: Scrapy response for a property detail page.

        Yields:
            A dict with keys ``parcel_id``, ``address``, ``owner_name``,
            ``land_value`` and ``total_appraisal``; a value is ``None`` when
            its XPath matches nothing.
        """
        general_info = '//table[@class="ui-widget-content ui-table generalinfo"]/tbody'
        parcel_id = response.xpath(general_info + '/tr[2]/td/text()').extract_first()
        address = response.xpath(general_info + '/tr[4]/td/text()').extract_first()
        owner_name = response.xpath(general_info + '/tr/td/text()').extract_first()

        land_value, total_appr = self._extract_values(response)
        if land_value is None or total_appr is None:
            # Fall back to the browser-rendered page only when needed, so
            # pages that already contain the values cost no extra page load.
            rendered = self._render(response.url)
            if rendered is not None:
                rendered_land, rendered_total = self._extract_values(rendered)
                if land_value is None:
                    land_value = rendered_land
                if total_appr is None:
                    total_appr = rendered_total

        yield {
            'parcel_id': parcel_id,
            'address': address,
            'owner_name': owner_name,
            'land_value': land_value,
            'total_appraisal': total_appr,
        }

    @staticmethod
    def _extract_values(selectable):
        """Return ``(land_value, total_appraisal)`` from a response or selector."""
        land_value = selectable.xpath(
            "//table[@class='ui-table']/tbody/tr[3]/td/text()").extract_first()
        total_appr = selectable.xpath(
            "//td[@class='ui-widget String ui-state-default']/text()").extract_first()
        return land_value, total_appr

    def _render(self, url):
        """Load *url* in the Selenium browser and return a Selector over the result.

        Returns ``None`` when no browser has been started (for example when
        ``parse_property`` is invoked without ``start_requests`` having run).
        This call blocks until the browser returns from loading the page.
        """
        driver = getattr(self, 'driver', None)
        if driver is None:
            return None
        driver.get(url)
        # NOTE(review): if the value tables are filled in asynchronously after
        # the load event, an explicit wait is needed here - confirm on the site.
        return Selector(text=driver.page_source)