
Automate the extraction of documents from a website

I am a beginner at Python and am trying to extract documents from websites.

  1. I am trying to automate the extraction of a document (a "fund factsheet") from websites. I want my program to (a) open a website, (b) search for a link labelled "factsheet" / "fact sheet", (c) follow that link, and (d) download the PDF file to my desktop.

  2. My dataset is an Excel file with 3 columns: Col A is "fund_name", Col B is "website_url", and Col C is "factsheet_url_pattern".

  3. "website_url" is used when there is no direct link to the document; "factsheet_url_pattern" is used when there is a direct link.

  4. Assume the first website is: https://globalfunds.virtus.com/products/virtus-gf-us-small-cap-focus-fund#shareclass.I/period.quarterly

  5. The program successfully opens the factsheet link but fails to save it as a PDF to the desktop.

  6. PROBLEM: at this stage, I have the link open, but I am unable to download the file as a PDF; the error is shown below.

Thank you!

The error that is generated when trying to extract the factsheet:

Failed to download https://globalfunds.virtus.com/assets/files/67/virtus_gf_us_small_cap_focus_fund_factsheet_1116.pdf: Content is not a PDF file (Content-Type: text/html; charset=utf-8)
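
A Content-Type of text/html almost always means the server answered with an HTML page (a redirect notice, cookie wall, or bot check) instead of the PDF bytes. A minimal diagnostic sketch, reusing the same factsheet URL, that shows what the server actually sends back (the truncation lengths are arbitrary):

import requests

url = ('https://globalfunds.virtus.com/assets/files/67/'
       'virtus_gf_us_small_cap_focus_fund_factsheet_1116.pdf')
headers = {'User-Agent': 'Mozilla/5.0'}  # browser-like header, as in the script below

response = requests.get(url, headers=headers, allow_redirects=True)
print(response.status_code, response.url)    # final URL after any redirects
print(response.headers.get('Content-Type'))  # what the server claims to send
print(response.content[:8])                  # a genuine PDF starts with b'%PDF-'
print(response.text[:300])                   # if it is HTML, this shows which page came back

For reference, the full script that produces the error is below.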

import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re  # For sanitizing file names
import numpy as np  # For handling NaN values


def download_file(url, file_path):
    try:
        # Send an HTTP GET request to the URL with headers
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
            )
        }
        response = requests.get(url, stream=True, headers=headers, allow_redirects=True)
        print(f"HTTP Status Code: {response.status_code}")
        print(f"Final URL after redirects: {response.url}")
        response.raise_for_status()  # Check if the request was successful
        # Check if the content type is PDF
        content_type = response.headers.get('Content-Type', '').lower()
        print(f"Content-Type: {content_type}")
        if 'pdf' not in content_type:
            print(f"Failed to download {url}: Content is not a PDF file (Content-Type: {content_type})")
            return False
        # Write the content to a file in chunks
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
        print(f"File downloaded successfully and saved to: {file_path}")
        return True
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


def find_factsheet_link(website_url):
    try:
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
            )
        }
        response = requests.get(website_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        keywords = ['factsheet', 'fact sheet']  # Updated keywords list
        # Find all links on the page
        for a in soup.find_all('a', href=True):
            href = a['href']
            text = a.get_text()
            # Check if any keyword is in the href or the text of the link
            if any(keyword.lower() in href.lower() or keyword.lower() in text.lower() for keyword in keywords):
                # Build the full URL
                if not href.startswith('http'):
                    href = urljoin(website_url, href)
                # Only consider URLs that end with .pdf
                if href.lower().endswith('.pdf'):
                    return href
        print(f"Could not find a PDF factsheet link on {website_url}")
        return None
    except Exception as e:
        print(f"Error accessing {website_url}: {e}")
        return None


def main():
    # Use a fixed download folder
    download_folder = 'Factsheets'
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    # Read funds from Excel file
    df = pd.read_excel('funds.xlsx')
    # Replace NaN values with empty strings
    df = df.replace({np.nan: ''})
    funds = df.to_dict('records')
    for fund in funds:
        # Extract and sanitize fund data
        fund_name = str(fund.get('fund_name', '')).strip()
        website_url = str(fund.get('website_url', '')).strip()
        factsheet_url_pattern = str(fund.get('factsheet_url_pattern', '')).strip()
        # Check if fund_name and website_url are not empty
        if not fund_name or not website_url:
            print(f"Missing fund_name or website_url for fund: {fund}")
            continue
        print(f"\nProcessing {fund_name}")
        if factsheet_url_pattern and factsheet_url_pattern.lower() != 'nan':
            # Use the provided factsheet URL
            factsheet_url = factsheet_url_pattern
        else:
            # Try to find the factsheet link
            factsheet_url = find_factsheet_link(website_url)
            if not factsheet_url:
                print(f"Could not find factsheet link for {fund_name}")
                continue
        # Print the factsheet URL being used
        print(f"Factsheet URL for {fund_name}: {factsheet_url}")
        # Download the file
        file_extension = os.path.splitext(factsheet_url)[1]
        if not file_extension:
            file_extension = '.pdf'  # Default to .pdf
        file_name = f"{fund_name}{file_extension}"
        # Sanitize the file name to remove any illegal characters
        sanitized_file_name = re.sub(r'[\\/*?:"<>|]', "", file_name)
        file_path = os.path.join(download_folder, sanitized_file_name)
        success = download_file(factsheet_url, file_path)
        if success:
            print(f"Downloaded {sanitized_file_name}")
        else:
            print(f"Failed to download factsheet for {fund_name}")


if __name__ == '__main__':
    main()
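
Since download_file() already sends a browser-like User-Agent and still receives HTML, one likely cause (unverified for this particular site) is that the server only serves the PDF to a request carrying the cookies and Referer of a browser that visited the fund page first. A sketch of that idea, with the session-priming flow being an assumption rather than a confirmed fix:

import requests

def download_pdf_with_session(page_url, pdf_url, file_path):
    # Hypothetical variant of download_file(): visit the fund page first so the
    # session picks up any cookies, then request the PDF with a Referer header.
    session = requests.Session()
    session.headers.update({
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/95.0.4638.69 Safari/537.36'),
    })
    session.get(page_url)  # prime cookies (assumption: the site sets them here)
    response = session.get(pdf_url, headers={'Referer': page_url}, stream=True)
    response.raise_for_status()
    # Trust the payload over the header: a genuine PDF begins with b'%PDF-'
    chunks = response.iter_content(chunk_size=8192)
    first_chunk = next(chunks, b'')
    if not first_chunk.startswith(b'%PDF-'):
        print(f"Still not a PDF (Content-Type: {response.headers.get('Content-Type')})")
        return False
    with open(file_path, 'wb') as f:
        f.write(first_chunk)
        for chunk in chunks:
            f.write(chunk)
    return True

If the server still refuses, the usual fallback is to let a real browser do the download via Selenium. A sketch using Chrome preferences that make the browser save PDFs instead of opening them in its built-in viewer; the fixed sleep is a crude placeholder for a proper download-completion check:

import os
import time
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
    'download.default_directory': os.path.abspath('Factsheets'),
    'plugins.always_open_pdf_externally': True,  # save PDFs rather than render them
})
driver = webdriver.Chrome(options=options)
driver.get('https://globalfunds.virtus.com/assets/files/67/'
           'virtus_gf_us_small_cap_focus_fund_factsheet_1116.pdf')
time.sleep(10)  # crude wait for the asynchronous download to finish
driver.quit()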
