From 261899ddce7f000de3b5d8c2ca6c73c065769190 Mon Sep 17 00:00:00 2001
From: fossilfranv
Date: Wed, 20 Sep 2023 16:03:23 -0700
Subject: [PATCH] Upload files to ""

---
 image_utils.py | 284 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 image_utils.py

diff --git a/image_utils.py b/image_utils.py
new file mode 100644
index 0000000..b468804
--- /dev/null
+++ b/image_utils.py
@@ -0,0 +1,284 @@
+import re
+import requests
+import html
+import codecs
+from bs4 import BeautifulSoup
+
+
+def extract_francetvinfo_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    html_source = page_content
+
+    # Find the index of " 720w" in the HTML source
+    index = html_source.find(" 720w")
+
+    if index != -1:
+        # Find the index of the preceding "https" starting from the found index
+        start_index = html_source.rfind("https", 0, index)
+
+        if start_index != -1:
+            # Extract the URL between "https" and " 720w"
+            image_url = html_source[start_index:index].strip()
+
+    # Now extract hashtags from the JSON-LD "keywords" string
+    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
+    if match and match.group(1):
+        keywords_str = match.group(1)
+        keywords_unescaped = html.unescape(keywords_str)
+        # Decode \uXXXX escape sequences embedded in the JSON string
+        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
+        keywords = re.findall(r'([^,]+)', keywords_decoded)
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_vancouver_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    context = page_content
+    # First URL listed in the preload imagesrcset attribute
+    regex_pattern = r'imagesrcset="([^"]+?),'
+    matches = re.search(regex_pattern, context)
+
+    if matches:
+        image_url = matches.group(1)
+
+    # Now extract hashtags from the JSON-LD "keywords" array
+    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
+    if match:
+        keywords_str = match.group(1)
+        keywords = re.findall(r'"([^"]+)"', keywords_str)
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_bbc_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    img_tag = soup.find('img')
+    if img_tag:
+        srcset = img_tag.get('srcset')
+        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
+        # Sort URLs by width in descending order
+        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
+
+        # Keep the widest variant that does not exceed 480px
+        for url, width in urls:
+            if int(width) <= 480:
+                image_url = url
+                break
+
+    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
+    keywords = [tag.text for tag in keyword_tags]
+    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+    if hashtags:
+        r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_androidauthority_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+    html_content = page_content
+
+    match = re.search(r'imageSrcSet="(.*?)"', html_content)
+    if match:
+        image_srcset = match.group(1)
+        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
+        # Keep the 712px-wide variant
+        for url, width in urls:
+            if int(width) == 712:
+                image_url = url
+
+    match = re.search(r'"keywords":\s*\[([^\]]+)\]', html_content)
+    if match:
+        keywords_str = match.group(1)
+        keywords_list = re.findall(r'"([^"]+)"', keywords_str)
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_theguardian_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
+    if match:
+        image_url = match.group(0) + "?width=620&dpr=1&s=none"
+
+    match = re.search(r'"keywords":"([^"]+)"', page_content)
+    if match and match.group(1):
+        keywords_str = match.group(1)
+        keywords = [keyword.strip() for keyword in keywords_str.split(',')]
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_cbc_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    # First <img> tag that carries an alt attribute
+    image_tag = soup.find('img', alt=True)
+    if image_tag:
+        image_url = image_tag['src']
+
+    # Keywords live in a "gs_keywords" JSON array
+    start_index = page_content.find('"gs_keywords":["')
+    if start_index != -1:
+        start_index += len('"gs_keywords":["')
+        end_index = page_content.find('"]', start_index)
+        if end_index != -1:
+            keywords = page_content[start_index:end_index].split('","')
+            hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_techrepublic_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    # Assumption: the image comes from the og:image meta tag
+    pattern = r'<meta property="og:image" content="([^"]+)"'
+    match = re.search(pattern, page_content)
+    if match:
+        image_url = match.group(1)
+
+    # Assumption: comma-separated keywords come from the keywords meta tag
+    keywords_str = re.search(r'<meta name="keywords" content="([^"]+)"', page_content)
+    if keywords_str and keywords_str.group(1):
+        keywords = keywords_str.group(1).split(',')
+        hashtags = ['#' + keyword.strip().replace(' ', '') for keyword in keywords]
+
+        if hashtags:
+            r_hashtags = hashtags[:5]
+
+    return image_url, r_hashtags
+
+
+def extract_lithub_image_url(page_content):
+    image_url = None
+    r_hashtags = None
+
+    html_source = page_content
+
+    pattern = r'
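
A minimal usage sketch for these extractors, assuming the file above is saved as image_utils.py: the article HTML is fetched with requests (which the module already imports) and an extractor is chosen by matching the article's domain. The dispatch table, helper name extract_for, and the example URL are illustrative assumptions, not part of the patch.

import requests

from image_utils import extract_bbc_image_url, extract_theguardian_image_url

# Hypothetical dispatch table: domain substring -> extractor function.
EXTRACTORS = {
    "bbc.": extract_bbc_image_url,
    "theguardian.com": extract_theguardian_image_url,
}


def extract_for(url):
    # Every extractor takes the raw HTML string and returns
    # (image_url, hashtags), with None for anything it cannot find.
    page_content = requests.get(url, timeout=10).text
    for domain, extractor in EXTRACTORS.items():
        if domain in url:
            return extractor(page_content)
    return None, None


# Example call (hypothetical URL):
image_url, hashtags = extract_for("https://www.bbc.com/news/some-article")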