From 261899ddce7f000de3b5d8c2ca6c73c065769190 Mon Sep 17 00:00:00 2001
From: fossilfranv
Date: Wed, 20 Sep 2023 16:03:23 -0700
Subject: [PATCH] Upload files to ""

---
 image_utils.py | 284 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 image_utils.py

diff --git a/image_utils.py b/image_utils.py
new file mode 100644
index 0000000..b468804
--- /dev/null
+++ b/image_utils.py
@@ -0,0 +1,284 @@
+import re
+import requests
+import html
+import codecs
+from bs4 import BeautifulSoup
+
+
+def extract_francetvinfo_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    html_source = page_content
+
+    # Find the index of " 720w" in the HTML source
+    index = html_source.find(" 720w")
+
+    if index != -1:
+        # Find the index of the preceding "https" starting from the found index
+        start_index = html_source.rfind("https", 0, index)
+
+        if start_index != -1:
+            # Extract the URL between "https" and " 720w"
+            image_url = html_source[start_index:index].strip()
+
+    # Now extract hashtags from the JSON-LD "keywords" string
+    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
+    if match and match.group(1):
+        keywords_str = match.group(1)
+        keywords_unescaped = html.unescape(keywords_str)
+        # Decode \uXXXX escape sequences embedded in the JSON string
+        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
+        keywords = re.findall(r'([^,]+)', keywords_decoded)
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_vancouver_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    context = page_content
+    # First URL listed in the preload imagesrcset attribute
+    regex_pattern = r'imagesrcset="([^"]+?),'
+    matches = re.search(regex_pattern, context)
+
+    if matches:
+        image_url = matches.group(1)
+
+    # Now extract hashtags from the JSON-LD "keywords" array
+    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
+    if match:
+        keywords_str = match.group(1)
+        keywords = re.findall(r'"([^"]+)"', keywords_str)
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_bbc_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    img_tag = soup.find('img')
+    if img_tag:
+        srcset = img_tag.get('srcset')
+        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
+        # Sort URLs by width in descending order
+        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
+
+        # Keep the widest variant that does not exceed 480px
+        for url, width in urls:
+            if int(width) <= 480:
+                image_url = url
+                break
+
+    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
+    keywords = [tag.text for tag in keyword_tags]
+    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+    if hashtags:
+        r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_androidauthority_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+    html_content = page_content
+
+    match = re.search(r'imageSrcSet="(.*?)"', html_content)
+    if match:
+        image_srcset = match.group(1)
+        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
+        # Keep the 712px-wide variant
+        for url, width in urls:
+            if int(width) == 712:
+                image_url = url
+
+    match = re.search(r'"keywords":\s*\[([^\]]+)\]', html_content)
+    if match:
+        keywords_str = match.group(1)
+        keywords_list = re.findall(r'"([^"]+)"', keywords_str)
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_theguardian_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
+    if match:
+        image_url = match.group(0) + "?width=620&dpr=1&s=none"
+
+    match = re.search(r'"keywords":"([^"]+)"', page_content)
+    if match and match.group(1):
+        keywords_str = match.group(1)
+        keywords = [keyword.strip() for keyword in keywords_str.split(',')]
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_cbc_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    # First <img> tag that carries an alt attribute
+    image_tag = soup.find('img', alt=True)
+    if image_tag:
+        image_url = image_tag['src']
+
+    # Keywords live in a "gs_keywords" JSON array
+    start_index = page_content.find('"gs_keywords":["')
+    if start_index != -1:
+        start_index += len('"gs_keywords":["')
+        end_index = page_content.find('"]', start_index)
+        if end_index != -1:
+            keywords = page_content[start_index:end_index].split('","')
+            hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_techrepublic_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    # Assumption: the image comes from the og:image meta tag
+    pattern = r'<meta property="og:image" content="([^"]+)"'
+    match = re.search(pattern, page_content)
+    if match:
+        image_url = match.group(1)
+
+    # Assumption: comma-separated keywords come from the keywords meta tag
+    keywords_str = re.search(r'<meta name="keywords" content="([^"]+)"', page_content)
+    if keywords_str and keywords_str.group(1):
+        keywords = keywords_str.group(1).split(',')
+        hashtags = ['#' + keyword.strip().replace(' ', '') for keyword in keywords]
+
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_lithub_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    html_source = page_content
+
+    pattern = r'
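
A minimal usage sketch for these extractors, assuming the file above is saved as image_utils.py: the article HTML is fetched with requests (which the module already imports) and an extractor is chosen by matching the article's domain. The dispatch table, helper name extract_for, and the example URL are illustrative assumptions, not part of the patch.

import requests

from image_utils import extract_bbc_image_url, extract_theguardian_image_url

# Hypothetical dispatch table: domain substring -> extractor function.
EXTRACTORS = {
    "bbc.": extract_bbc_image_url,
    "theguardian.com": extract_theguardian_image_url,
}


def extract_for(url):
    # Every extractor takes the raw HTML string and returns
    # (image_url, hashtags), with None for anything it cannot find.
    page_content = requests.get(url, timeout=10).text
    for domain, extractor in EXTRACTORS.items():
        if domain in url:
            return extractor(page_content)
    return None, None


# Example call (hypothetical URL):
image_url, hashtags = extract_for("https://www.bbc.com/news/some-article")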