import re
import html
import codecs

import requests
from bs4 import BeautifulSoup


def extract_francetvinfo_image_url(page_content):
    image_url = None
    r_hashtags = None
    html_source = page_content

    # Locate the " 720w" srcset descriptor, then walk back to the
    # preceding "https" to recover the 720px-wide image URL.
    index = html_source.find(" 720w")
    if index != -1:
        start_index = html_source.rfind("https", 0, index)
        if start_index != -1:
            image_url = html_source[start_index:index].strip()

    # Extract hashtags from the JSON "keywords" field.
    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords_unescaped = html.unescape(keywords_str)
        # Decode any literal \uXXXX escapes left in the JSON string.
        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
        keywords = [keyword.strip() for keyword in keywords_decoded.split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_vancouver_image_url(page_content):
    image_url = None
    r_hashtags = None

    # The preload <link> carries an imagesrcset attribute; take the first URL.
    matches = re.search(r'imagesrcset="([^"]+?),', page_content)
    if matches:
        image_url = matches.group(1)

    # Keywords are a JSON array in the page metadata.
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords = re.findall(r'"([^"]+)"', match.group(1))
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_bbc_image_url(page_content):
    image_url = None
    r_hashtags = None
    soup = BeautifulSoup(page_content, 'html.parser')

    # Pick the largest srcset variant no wider than 480px.
    img_tag = soup.find('img')
    if img_tag:
        srcset = img_tag.get('srcset')
        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
        # Sort URLs by width in descending order, then stop at the first
        # one that fits (otherwise smaller variants would overwrite it).
        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
        for url, width in urls:
            if int(width) <= 480:
                image_url = url
                break

    # Topic links double as keywords.
    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
    keywords = [tag.text for tag in keyword_tags]
    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
    if hashtags:
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_androidauthority_image_url(page_content):
    image_url = None
    r_hashtags = None

    match = re.search(r'imageSrcSet="(.*?)"', page_content)
    if match:
        image_srcset = match.group(1)
        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
        for url, width in urls:
            # Keep the 712px-wide variant.
            if int(width) == 712:
                image_url = url
                break

    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_list = re.findall(r'"([^"]+)"', match.group(1))
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags
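# Usage sketch (illustrative only; the helper name and URL below are
# placeholders, not part of the original module). Each extractor takes
# raw HTML and returns an (image_url, hashtags) pair, either of which
# may be None.
def _demo_extract(url="https://www.androidauthority.com/some-article/"):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return extract_androidauthority_image_url(response.text)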
def extract_theguardian_image_url(page_content):
    image_url = None
    r_hashtags = None

    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
    if match:
        # Request a 620px-wide rendition from the image resizer.
        image_url = match.group(0) + "?width=620&dpr=1&s=none"

    match = re.search(r'"keywords":"([^"]+)"', page_content)
    if match and match.group(1):
        keywords = [keyword.strip() for keyword in match.group(1).split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_cbc_image_url(page_content):
    image_url = None
    r_hashtags = None
    soup = BeautifulSoup(page_content, 'html.parser')

    # First <img> carrying an alt attribute is taken as the article image.
    image_tag = soup.find('img', alt=True)
    if image_tag:
        image_url = image_tag['src']

    # Keywords live in a "gs_keywords" JSON array; slice it out by hand.
    start_index = page_content.find('"gs_keywords":["')
    if start_index != -1:
        start_index += len('"gs_keywords":["')
        end_index = page_content.find('"]', start_index)
        if end_index != -1:
            keywords = page_content[start_index:end_index].split('","')
            hashtags = ['#' + keyword for keyword in keywords]
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_techrepublic_image_url(page_content):
    image_url = None
    r_hashtags = None

    # Assumption: the image is exposed via a standard og:image meta tag.
    pattern = r'<meta property="og:image" content="([^"]+)"'
    match = re.search(pattern, page_content)
    if match:
        image_url = match.group(1)

    # Assumption: keywords are a comma-separated keywords meta tag.
    keywords_str = re.search(r'<meta name="keywords" content="([^"]+)"', page_content)
    if keywords_str and keywords_str.group(1):
        keywords = keywords_str.group(1).split(',')
        hashtags = ['#' + keyword.strip().replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_lithub_image_url(page_content):
    image_url = None
    r_hashtags = None
    html_source = page_content
    pattern = r'