I'd like to create a highly automated scraper, which would be able to open the search results page of cnn.com (that's why I need Selenium), extract some information from each article, then get to the next page, however, with little to no success so far.
Currently my code looks like this (I know, it's probably terrible, it's a patchwork of other spiders I found):
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher
from cnn.items import CNNitem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class CNNspider(CrawlSpider):
name = "cnn_spider"
allowed_domains = ['cnn.com']
start_urls = ['https://www.cnn.com/search?q=elizabeth%20warren&size=10&page=1']
rules = [
Rule(LinkExtractor(restrict_xpaths='//div[@class="cnn-search__results-list"]//h3/a/@href'), callback='parse_post', follow= True),
]
def __init__(self, *a, **kw):
self.driver = webdriver.Chrome()
super(CNNspider, self).__init__(*a, **kw)
def parse_page(self, response):
# selenium part of the job
self.driver.get(response.url)
while True:
more_btn = WebDriverWait(self.driver, 10).until(
EC.visibility_of_element_located((By.XPATH, "//div[@class='pagination-bar']/div[contains(text(), 'Next')]"))
)
more_btn.click()
# stop when we reach the desired page
if self.driver.current_url.endswith('page=161'):
break
# now scrapy should do the job
response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
for post in response.xpath('//div[@class="cnn-search__results-list"]/div[@class="cnn-search__result cnn-search__result--article"]'):
item = CNNitem()
item['Title'] = post.xpath('.//h3[@class="cnn-search__result-headline"]/a/text()').extract()
item['Link'] = post.xpath('.//h3[@class="cnn-search__result-headline"]/a/@href').extract()
yield scrapy.Request(item['Link'], meta={'item': item}, callback=self.parse_post)
def parse_post(self, response):
item = response.meta['item']
item["Body"] = response.xpath('//section[@id="body-text"]/div[1]/div/text()').extract()
return item
Right now what Chrome does is it opens the first page, and almost immediately closes it, without doing anything. Can someone help me with putting this together?