I have a script that scrapes a web site and tests all the links in it ha it finds. My issue is that when a come across a link wih a double forward slash (like //us.cnn.com), my script fails.
Here is the code my script fails on:
elif "//" in link.get('href'):
link = "http:" + str(link)
print("tested link is: " + link)
driver = webdriver.Chrome(
'/home/ironmantis7x/PycharmProjects/WebScrapper/chromedriver')
#driver.get(link)
#driver.get(str(link))
driver.get(link.get('href'))
what I am trying to do is when it comes across a link that has "//" (double slashes), i just want to send that link with http: added to it so selenium opens that complete link (http://cnn.com/us for example).
How can I accomplish this correctly?
Here is the complete script in case I is needed for reference.
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import sys
import time
from datetime import date
from datetime import datetime
import datetime
# chrome browser control options
options = webdriver.ChromeOptions()
options.add_argument('headless')
# options.add_argument('--ignore-certificate-errors')
# options.add_argument("--test-type")
options.binary_location = "/usr/bin/google-chrome" # <--- needed actual path to chrome browser
# hard set path to chromedriver in project
# driver = webdriver.Chrome('/home/ironmantis7x/Documents/BSSLLC/projects/PycharmProjects/WebScrapper/chromedriver')
# system time for time/date stamping
now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# fetching url to test
url = raw_input("Enter a website to extract the URL's from: ")
r = requests.get("http://" + url)
data = r.text
# soup = BeautifulSoup(data)
soup = BeautifulSoup(data, 'html.parser')
validURL = 0
validChildURL = 0
invalidURL = 0
for link in soup.find_all('a'):
if "http" in link.get('href'):
driver = webdriver.Chrome(
'/home/ironmantis7x/PycharmProjects/WebScrapper/chromedriver')
driver.get(link.get('href'))
print(driver.title)
with open('valid_link_2.txt', 'a') as f:
print >> f, 'link:', link.get('href')
print(link.get('href'))
driver.get(link.get('href'))
driver.quit()
validURL = validURL + 1
elif "https" in link.get('href'):
driver = webdriver.Chrome(
'/home/ironmantis7x/PycharmProjects/WebScrapper/chromedriver')
driver.get(link.get('href'))
print(driver.title)
with open('valid_link_2.txt', 'a') as f:
print >> f, 'link:', link.get('href')
print(link.get('href'))
driver.get(link.get('href'))
driver.quit()
validURL = validURL + 1
elif "//" in link.get('href'):
link = "http:" + str(link)
print("tested link is: " + link)
driver = webdriver.Chrome(
'/home/ironmantis7x/PycharmProjects/WebScrapper/chromedriver')
driver.get(link.get('href'))
print(driver.title)
with open('valid_link.txt', 'a') as f:
print >> f, 'link:', link.get('href')
print(link.get('href'))
driver.get(link.get('href'))
driver.quit()
validURL = validURL + 1
else:
print(link.get('href') + " is an invalid link")
with open('invalid_link.txt', 'a') as f:
print >> f, link.get('href')
print(link.get('href'))
driver = webdriver.Chrome('/home/ironmantis7x/PycharmProjects/WebScrapper/chromedriver',
chrome_options=options)
driver.quit()
invalidURL = invalidURL + 1