"""Per-site scrapers that pull a representative image URL and hashtags
out of raw article HTML.

Every ``extract_<site>_image_url(page_content)`` function takes the page's
HTML source as a ``str`` and returns a 2-tuple ``(image_url, r_hashtags)``:

* ``image_url`` -- ``str`` URL of a suitable article image, or ``None``
  when no match was found.
* ``r_hashtags`` -- list of at most five ``"#keyword"`` strings (spaces
  stripped from each keyword), or ``None`` when no keywords were found.

NOTE(review): reconstructed from a whitespace-mangled
"Delete image_utils.py" patch. The final function of the original file
(``extract_techrepublic_image_url``) was truncated mid-definition in the
patch and is intentionally omitted here -- restore it from history if it
is still needed.
"""

import re
import html

import requests  # noqa: F401  -- kept from the original module (unused here)
import codecs    # noqa: F401  -- kept from the original module (unused here)
from bs4 import BeautifulSoup


def extract_francetvinfo_image_url(page_content):
    """Extract image URL and hashtags from a francetvinfo.fr article.

    The image URL is found by locating the ``" 720w"`` srcset width marker
    and backtracking to the nearest preceding ``"https"``. Hashtags come
    from the page's JSON ``"keywords"`` string (comma-separated).

    Returns ``(image_url or None, list of <=5 hashtags or None)``.
    """
    image_url = None
    r_hashtags = None

    # Locate the 720px-wide srcset entry, then backtrack to the URL start.
    index = page_content.find(" 720w")
    if index != -1:
        start_index = page_content.rfind("https", 0, index)
        if start_index != -1:
            image_url = page_content[start_index:index].strip()

    # Hashtags: escaped JSON string value, e.g. "keywords":"a, b, c".
    match = re.search(r'"keywords":\s*\"([^\"]+)\"', page_content)
    if match and match.group(1):
        keywords_unescaped = html.unescape(match.group(1))
        # NOTE(review): this utf-8 -> unicode_escape round trip corrupts
        # non-ASCII keywords in general; preserved because the original
        # relied on it -- TODO confirm against real page data.
        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
        keywords = re.findall(r'([^,]+)', keywords_decoded)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_vancouver_image_url(page_content):
    """Extract image URL and hashtags from a Vancouver Sun article.

    The image URL is the first entry of an ``imagesrcset="..."`` attribute;
    hashtags come from a JSON ``"keywords": [...]`` array.

    Returns ``(image_url or None, list of <=5 hashtags or None)``.
    """
    image_url = None
    r_hashtags = None

    matches = re.search(r'imagesrcset="([^"]+?),', page_content)
    if matches:
        image_url = matches.group(1)

    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords = re.findall(r'"([^"]+)"', match.group(1))
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_bbc_image_url(page_content):
    """Extract image URL and hashtags from a BBC article.

    Picks the LARGEST ``.jpg`` srcset candidate whose width is <= 480px.
    Hashtags come from the topic-link anchors in the page footer.

    Returns ``(image_url or None, list of <=5 hashtags or None)``.
    """
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    img_tag = soup.find('img')
    if img_tag:
        srcset = img_tag.get('srcset')
        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
        # Sort widest-first so the first acceptable hit is the largest one.
        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
        for url, width in urls:
            if int(width) <= 480:
                image_url = url
                # BUGFIX: the original kept iterating and overwrote the
                # result, ending up with the SMALLEST matching width (and
                # left a debug print in the loop).
                break

    # BUGFIX: reuse the soup already built above instead of re-parsing.
    # NOTE(review): class names are build artifacts and may rotate --
    # TODO confirm this selector still matches current BBC markup.
    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
    keywords = [tag.text for tag in keyword_tags]
    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
    if hashtags:
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_androidauthority_image_url(page_content):
    """Extract image URL and hashtags from an Android Authority article.

    Picks the ``.webp`` srcset candidate whose declared width is exactly
    712px. Hashtags come from a JSON ``"keywords": [...]`` array.

    Returns ``(image_url or None, list of <=5 hashtags or None)``.
    """
    image_url = None
    r_hashtags = None

    match = re.search(r'imageSrcSet="(.*?)"', page_content)
    if match:
        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', match.group(1))
        for url, width in urls:
            if int(width) == 712:
                image_url = url
                # BUGFIX: the original's ``else: image_url = None`` clobbered
                # a successful match whenever a non-712 entry followed it.
                break

    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_list = re.findall(r'"([^"]+)"', match.group(1))
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_theguardian_image_url(page_content):
    """Extract image URL and hashtags from a Guardian article.

    Takes the first ``src="https...jpg"`` URL and appends fixed resizing
    query parameters. Hashtags come from the JSON ``"keywords"`` string
    (comma-separated).

    Returns ``(image_url or None, list of <=5 hashtags or None)``.
    """
    image_url = None
    r_hashtags = None

    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
    if match:
        # BUGFIX: the original assigned match.group(0) twice; the first
        # assignment was dead code.
        image_url = match.group(0) + "?width=620&dpr=1&s=none"

    match = re.search(r'"keywords":"([^"]+)"', page_content)
    if match and match.group(1):
        keywords = [keyword.strip() for keyword in match.group(1).split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_cbc_image_url(page_content):
    """Extract image URL and hashtags from a CBC article.

    The image URL is the ``src`` of the first ``<img>`` that carries an
    ``alt`` attribute. Hashtags come from the ``"gs_keywords":[...]``
    JSON fragment, parsed by plain string slicing.

    Returns ``(image_url or None, list of <=5 hashtags or None)``.
    """
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')
    image_tag = soup.find('img', alt=True)
    if image_tag:
        image_url = image_tag['src']

    marker = '"gs_keywords":["'
    start_index = page_content.find(marker)
    if start_index != -1:
        start_index += len(marker)
        end_index = page_content.find('"]', start_index)
        if end_index != -1:
            keywords = page_content[start_index:end_index].split('","')
            r_hashtags = ['#' + keyword for keyword in keywords][:5]

    return image_url, r_hashtags