Quantcast
Channel: Active questions tagged selenium - Stack Overflow
Viewing all articles
Browse latest Browse all 97807

Scraping CNN with Scrapy and Selenium

$
0
0

I'd like to create a highly automated scraper, which would be able to open the search results page of cnn.com (that's why I need Selenium), extract some information from each article, then get to the next page, however, with little to no success so far.

Currently my code looks like this (I know, it's probably terrible, it's a patchwork of other spiders I found):

    import scrapy
from scrapy import signals
from scrapy.http import TextResponse 
from scrapy.xlib.pydispatch import dispatcher
from cnn.items import CNNitem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class CNNspider(CrawlSpider):
    name = "cnn_spider"
    allowed_domains = ['cnn.com']
    start_urls = ['https://www.cnn.com/search?q=elizabeth%20warren&size=10&page=1']
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//div[@class="cnn-search__results-list"]//h3/a/@href'), callback='parse_post', follow= True),
        ]

    def __init__(self, *a, **kw):
        self.driver = webdriver.Chrome()
        super(CNNspider, self).__init__(*a, **kw)



    def parse_page(self, response):
        # selenium part of the job
        self.driver.get(response.url)
        while True:
            more_btn = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((By.XPATH, "//div[@class='pagination-bar']/div[contains(text(), 'Next')]"))
            )

            more_btn.click()

            # stop when we reach the desired page
            if self.driver.current_url.endswith('page=161'):
                break

        # now scrapy should do the job
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        for post in response.xpath('//div[@class="cnn-search__results-list"]/div[@class="cnn-search__result cnn-search__result--article"]'):
            item = CNNitem()
            item['Title'] = post.xpath('.//h3[@class="cnn-search__result-headline"]/a/text()').extract()
            item['Link'] = post.xpath('.//h3[@class="cnn-search__result-headline"]/a/@href').extract()

            yield scrapy.Request(item['Link'], meta={'item': item}, callback=self.parse_post)

    def parse_post(self, response):
        item = response.meta['item']
        item["Body"] = response.xpath('//section[@id="body-text"]/div[1]/div/text()').extract()
        return item

Right now what Chrome does is it opens the first page, and almost immediately closes it, without doing anything. Can someone help me with putting this together?


Viewing all articles
Browse latest Browse all 97807

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>