I am a beginner at Python and am trying to automate the extraction of a document (a "fund factsheet") from websites. I want my program to (a) enter a website, (b) search for a link containing the word "factsheet" / "fact sheet", (c) follow the link, and (d) download the PDF file onto my desktop.
My dataset is an Excel file with 3 columns: Col A is "fund_name", Col B is "website_url", and Col C is "factsheet_url_pattern".
"website_url" is used when there is no direct link to the document; "factsheet_url_pattern" is used when there is a direct link to the document.
Assume the first website is: https://globalfunds.virtus.com/products/virtus-gf-us-small-cap-focus-fund#shareclass.I/period.quarterly
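For illustration, the first row of my dataset then looks something like this (the fund_name shown is my guess from the URL slug, and Col C is left blank here as an example of a fund with no direct PDF link):

```
fund_name                           website_url                                                                                               factsheet_url_pattern
Virtus GF US Small Cap Focus Fund   https://globalfunds.virtus.com/products/virtus-gf-us-small-cap-focus-fund#shareclass.I/period.quarterly  (blank)
```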
The program successfully finds and opens the factsheet link, but it fails at the final step of saving the file as a PDF to my desktop. This is the error generated when trying to download the factsheet:

```
Failed to download https://globalfunds.virtus.com/assets/files/67/virtus_gf_us_small_cap_focus_fund_factsheet_1116.pdf: Content is not a PDF file (Content-Type: text/html; charset=utf-8)
```
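For debugging, here is a minimal sketch that reproduces the failure on its own, using the exact URL from the error above; printing the start of the body should show what the server is actually sending back instead of PDF bytes:

```python
import requests

# Direct PDF URL copied from the error message above
url = ("https://globalfunds.virtus.com/assets/files/67/"
       "virtus_gf_us_small_cap_focus_fund_factsheet_1116.pdf")

# Same browser-like User-Agent header the full script uses
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36')
}

response = requests.get(url, headers=headers, allow_redirects=True)
print(response.status_code)
print(response.headers.get('Content-Type'))
# Peek at the body: if it is text/html, this should reveal an error page,
# a cookie/consent page, or a bot-protection page rather than PDF bytes
print(response.text[:500])
```

Below is the full script.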
```python
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re  # For sanitizing file names
import numpy as np  # For handling NaN values


def download_file(url, file_path):
    try:
        # Send an HTTP GET request to the URL with headers
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36')
        }
        response = requests.get(url, stream=True, headers=headers, allow_redirects=True)
        print(f"HTTP Status Code: {response.status_code}")
        print(f"Final URL after redirects: {response.url}")
        response.raise_for_status()  # Check if the request was successful

        # Check if the content type is PDF
        content_type = response.headers.get('Content-Type', '').lower()
        print(f"Content-Type: {content_type}")
        if 'pdf' not in content_type:
            print(f"Failed to download {url}: Content is not a PDF file "
                  f"(Content-Type: {content_type})")
            return False

        # Write the content to a file in chunks
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
        print(f"File downloaded successfully and saved to: {file_path}")
        return True
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


def find_factsheet_link(website_url):
    try:
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36')
        }
        response = requests.get(website_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        keywords = ['factsheet', 'fact sheet']

        # Find all links on the page
        for a in soup.find_all('a', href=True):
            href = a['href']
            text = a.get_text()
            # Check if any keyword is in the href or the text of the link
            if any(keyword.lower() in href.lower() or keyword.lower() in text.lower()
                   for keyword in keywords):
                # Build the full URL
                if not href.startswith('http'):
                    href = urljoin(website_url, href)
                # Only consider URLs that end with .pdf
                if href.lower().endswith('.pdf'):
                    return href
        print(f"Could not find a PDF factsheet link on {website_url}")
        return None
    except Exception as e:
        print(f"Error accessing {website_url}: {e}")
        return None


def main():
    # Use a fixed download folder
    download_folder = 'Factsheets'
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # Read funds from Excel file
    df = pd.read_excel('funds.xlsx')
    # Replace NaN values with empty strings
    df = df.replace({np.nan: ''})
    funds = df.to_dict('records')

    for fund in funds:
        # Extract and sanitize fund data
        fund_name = str(fund.get('fund_name', '')).strip()
        website_url = str(fund.get('website_url', '')).strip()
        factsheet_url_pattern = str(fund.get('factsheet_url_pattern', '')).strip()

        # Check if fund_name and website_url are not empty
        if not fund_name or not website_url:
            print(f"Missing fund_name or website_url for fund: {fund}")
            continue

        print(f"\nProcessing {fund_name}")

        if factsheet_url_pattern and factsheet_url_pattern.lower() != 'nan':
            # Use the provided factsheet URL
            factsheet_url = factsheet_url_pattern
        else:
            # Try to find the factsheet link
            factsheet_url = find_factsheet_link(website_url)
            if not factsheet_url:
                print(f"Could not find factsheet link for {fund_name}")
                continue

        # Print the factsheet URL being used
        print(f"Factsheet URL for {fund_name}: {factsheet_url}")

        # Download the file
        file_extension = os.path.splitext(factsheet_url)[1]
        if not file_extension:
            file_extension = '.pdf'  # Default to .pdf
        file_name = f"{fund_name}{file_extension}"
        # Sanitize the file name to remove any illegal characters
        sanitized_file_name = re.sub(r'[\\/*?:"<>|]', "", file_name)
        file_path = os.path.join(download_folder, sanitized_file_name)
        success = download_file(factsheet_url, file_path)
        if success:
            print(f"Downloaded {sanitized_file_name}")
        else:
            print(f"Failed to download factsheet for {fund_name}")


if __name__ == '__main__':
    main()
```
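One idea I am considering (not in the script above; download_pdf here is a hypothetical replacement for my download_file) is to verify the file signature instead of trusting the Content-Type header alone, since real PDF files start with the bytes %PDF:

```python
import requests

def download_pdf(url, file_path, headers=None):
    # Hypothetical variant of download_file: verify the PDF magic bytes
    # (b'%PDF') instead of relying only on the Content-Type header.
    response = requests.get(url, stream=True, headers=headers, allow_redirects=True)
    response.raise_for_status()

    chunks = response.iter_content(chunk_size=8192)
    first_chunk = next(chunks, b'')
    if not first_chunk.startswith(b'%PDF'):
        # Not a PDF: the body is probably an HTML error or landing page
        print(f"Not a PDF: body starts with {first_chunk[:40]!r}")
        return False

    with open(file_path, 'wb') as f:
        f.write(first_chunk)  # keep the bytes already read from the stream
        for chunk in chunks:
            f.write(chunk)
    return True
```

Even with that check, though, I am still stuck on why the server returns text/html for this URL in the first place. Thank you!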