I am a beginner at Python and am trying to automate the extraction of a document (a "fund factsheet") from websites. I want my program to (a) enter a website, (b) search for a link containing the word "factsheet" / "fact sheet", (c) follow the link, and (d) download the PDF file onto my desktop.
My dataset is an Excel file with 3 columns: Col A is "fund_name", Col B is "website_url", and Col C is "factsheet_url_pattern".
"website_url" is used when there is no direct link to the document; "factsheet_url_pattern" is used when there is a direct link to the document.
Assume the first website is: https://globalfunds.virtus.com/products/virtus-gf-us-small-cap-focus-fund#shareclass.I/period.quarterly
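For illustration, the first row of my dataset then looks something like this (the fund_name shown is my guess from the URL slug, and Col C is left blank here as an example of a fund with no direct PDF link):

```
fund_name                           website_url                                                                                               factsheet_url_pattern
Virtus GF US Small Cap Focus Fund   https://globalfunds.virtus.com/products/virtus-gf-us-small-cap-focus-fund#shareclass.I/period.quarterly  (blank)
```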
The program successfully finds and opens the factsheet link, but it fails at the final step of saving the file as a PDF to my desktop. This is the error generated when trying to download the factsheet:

```
Failed to download https://globalfunds.virtus.com/assets/files/67/virtus_gf_us_small_cap_focus_fund_factsheet_1116.pdf: Content is not a PDF file (Content-Type: text/html; charset=utf-8)
```
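For debugging, here is a minimal sketch that reproduces the failure on its own, using the exact URL from the error above; printing the start of the body should show what the server is actually sending back instead of PDF bytes:

```python
import requests

# Direct PDF URL copied from the error message above
url = ("https://globalfunds.virtus.com/assets/files/67/"
       "virtus_gf_us_small_cap_focus_fund_factsheet_1116.pdf")

# Same browser-like User-Agent header the full script uses
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36')
}

response = requests.get(url, headers=headers, allow_redirects=True)
print(response.status_code)
print(response.headers.get('Content-Type'))
# Peek at the body: if it is text/html, this should reveal an error page,
# a cookie/consent page, or a bot-protection page rather than PDF bytes
print(response.text[:500])
```

Below is the full script.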
```python
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re  # For sanitizing file names
import numpy as np  # For handling NaN values


def download_file(url, file_path):
    try:
        # Send an HTTP GET request to the URL with headers
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36')
        }
        response = requests.get(url, stream=True, headers=headers, allow_redirects=True)
        print(f"HTTP Status Code: {response.status_code}")
        print(f"Final URL after redirects: {response.url}")
        response.raise_for_status()  # Check if the request was successful

        # Check if the content type is PDF
        content_type = response.headers.get('Content-Type', '').lower()
        print(f"Content-Type: {content_type}")
        if 'pdf' not in content_type:
            print(f"Failed to download {url}: Content is not a PDF file "
                  f"(Content-Type: {content_type})")
            return False

        # Write the content to a file in chunks
        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)
        print(f"File downloaded successfully and saved to: {file_path}")
        return True
    except Exception as e:
        print(f"An error occurred: {e}")
        return False


def find_factsheet_link(website_url):
    try:
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36')
        }
        response = requests.get(website_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        keywords = ['factsheet', 'fact sheet']

        # Find all links on the page
        for a in soup.find_all('a', href=True):
            href = a['href']
            text = a.get_text()
            # Check if any keyword is in the href or the text of the link
            if any(keyword.lower() in href.lower() or keyword.lower() in text.lower()
                   for keyword in keywords):
                # Build the full URL
                if not href.startswith('http'):
                    href = urljoin(website_url, href)
                # Only consider URLs that end with .pdf
                if href.lower().endswith('.pdf'):
                    return href
        print(f"Could not find a PDF factsheet link on {website_url}")
        return None
    except Exception as e:
        print(f"Error accessing {website_url}: {e}")
        return None


def main():
    # Use a fixed download folder
    download_folder = 'Factsheets'
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # Read funds from Excel file
    df = pd.read_excel('funds.xlsx')
    # Replace NaN values with empty strings
    df = df.replace({np.nan: ''})
    funds = df.to_dict('records')

    for fund in funds:
        # Extract and sanitize fund data
        fund_name = str(fund.get('fund_name', '')).strip()
        website_url = str(fund.get('website_url', '')).strip()
        factsheet_url_pattern = str(fund.get('factsheet_url_pattern', '')).strip()

        # Check if fund_name and website_url are not empty
        if not fund_name or not website_url:
            print(f"Missing fund_name or website_url for fund: {fund}")
            continue

        print(f"\nProcessing {fund_name}")

        if factsheet_url_pattern and factsheet_url_pattern.lower() != 'nan':
            # Use the provided factsheet URL
            factsheet_url = factsheet_url_pattern
        else:
            # Try to find the factsheet link
            factsheet_url = find_factsheet_link(website_url)
            if not factsheet_url:
                print(f"Could not find factsheet link for {fund_name}")
                continue

        # Print the factsheet URL being used
        print(f"Factsheet URL for {fund_name}: {factsheet_url}")

        # Download the file
        file_extension = os.path.splitext(factsheet_url)[1]
        if not file_extension:
            file_extension = '.pdf'  # Default to .pdf
        file_name = f"{fund_name}{file_extension}"
        # Sanitize the file name to remove any illegal characters
        sanitized_file_name = re.sub(r'[\\/*?:"<>|]', "", file_name)
        file_path = os.path.join(download_folder, sanitized_file_name)
        success = download_file(factsheet_url, file_path)
        if success:
            print(f"Downloaded {sanitized_file_name}")
        else:
            print(f"Failed to download factsheet for {fund_name}")


if __name__ == '__main__':
    main()
```
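One idea I am considering (not in the script above; download_pdf here is a hypothetical replacement for my download_file) is to verify the file signature instead of trusting the Content-Type header alone, since real PDF files start with the bytes %PDF:

```python
import requests

def download_pdf(url, file_path, headers=None):
    # Hypothetical variant of download_file: verify the PDF magic bytes
    # (b'%PDF') instead of relying only on the Content-Type header.
    response = requests.get(url, stream=True, headers=headers, allow_redirects=True)
    response.raise_for_status()

    chunks = response.iter_content(chunk_size=8192)
    first_chunk = next(chunks, b'')
    if not first_chunk.startswith(b'%PDF'):
        # Not a PDF: the body is probably an HTML error or landing page
        print(f"Not a PDF: body starts with {first_chunk[:40]!r}")
        return False

    with open(file_path, 'wb') as f:
        f.write(first_chunk)  # keep the bytes already read from the stream
        for chunk in chunks:
            f.write(chunk)
    return True
```

Even with that check, though, I am still stuck on why the server returns text/html for this URL in the first place. Thank you!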