Delete image_utils.py
This commit is contained in: parent 11c3d1c6b7, commit 666565411c

image_utils.py: 348 deletions

@@ -1,348 +0,0 @@
import re
import requests
import html
from bs4 import BeautifulSoup

def extract_francetvinfo_image_url(page_content):
    """Return (image_url, hashtags) scraped from a francetvinfo.fr article page."""
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of " 720w" (the 720px srcset candidate) in the HTML source
    index = html_source.find(" 720w")

    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)

        if start_index != -1:
            # Extract the URL
            image_url = html_source[start_index:index].strip()

    # Now extract hashtags from the "keywords" JSON string
    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords_unescaped = html.unescape(keywords_str)
        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
        keywords = re.findall(r'([^,]+)', keywords_decoded)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags

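# Note on the round-trip above: the "keywords" value can carry literal
# "\u00e9"-style escape sequences; encode('utf-8').decode('unicode_escape')
# turns them into real characters. With an illustrative (assumed) input:
#   'politique,\u00e9conomie'  ->  'politique,économie'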

def extract_vancouver_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Vancouver Sun article page."""
    image_url = None
    r_hashtags = None

    context = page_content

    # First URL listed in the preload link's imagesrcset attribute
    regex_pattern = r'imagesrcset="([^"]+?),'
    matches = re.search(regex_pattern, context)

    if matches:
        image_url = matches.group(1)

    # Now extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_str = match.group(1)
        keywords = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_bbc_image_url(page_content):
    """Return (image_url, hashtags) scraped from a BBC article page."""
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    img_tag = soup.find('img')
    if img_tag:
        srcset = img_tag.get('srcset')
        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
        # Sort URLs by width in descending order, then keep the widest
        # candidate that is at most 480px wide
        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
        for url, width in urls:
            if int(width) <= 480:
                image_url = url
                break

    # The article's topic links double as keywords
    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
    keywords = [tag.text for tag in keyword_tags]
    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
    if hashtags:
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_androidauthority_image_url(page_content):
    """Return (image_url, hashtags) scraped from an Android Authority article page."""
    image_url = None
    r_hashtags = None
    html_content = page_content

    match = re.search(r'imageSrcSet="(.*?)"', html_content)
    if match:
        image_srcset = match.group(1)
        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
        # Keep the 712px-wide candidate
        for url, width in urls:
            if int(width) == 712:
                image_url = url
                break

    # Extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', html_content)
    if match:
        keywords_str = match.group(1)
        keywords_list = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_theguardian_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Guardian article page."""
    image_url = None
    r_hashtags = None

    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
    if match:
        # Ask the image CDN for a 620px-wide rendition
        image_url = match.group(0) + "?width=620&dpr=1&s=none"

    match = re.search(r'"keywords":"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords = [keyword.strip() for keyword in keywords_str.split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_cbc_image_url(page_content):
    """Return (image_url, hashtags) scraped from a CBC article page."""
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    # First image that carries an alt attribute
    image_tag = soup.find('img', alt=True)
    if image_tag:
        image_url = image_tag['src']

    # CBC embeds its keywords in a "gs_keywords" JSON array
    start_index = page_content.find('"gs_keywords":["')
    if start_index != -1:
        start_index += len('"gs_keywords":["')
        end_index = page_content.find('"]', start_index)
        if end_index != -1:
            keywords = page_content[start_index:end_index].split('","')
            hashtags = ['#' + keyword for keyword in keywords]
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_techrepublic_image_url(page_content):
    """Return (image_url, hashtags) scraped from a TechRepublic article page.

    No keyword extraction is implemented for this site, so hashtags is always None.
    """
    image_url = None
    r_hashtags = None

    # The og:image meta tag carries the lead image
    pattern = r'<meta property="og:image" content="([^"]+?)"'
    match = re.search(pattern, str(page_content))
    if match:
        image_url = match.group(1)

    return image_url, r_hashtags


def extract_time_image_url(page_content):
    """Return (image_url, hashtags) scraped from a TIME article page."""
    image_url = None
    r_hashtags = None

    # Lead image from the JSON-LD ImageObject
    pattern = r'"image":\[\{"@type":"ImageObject","url":"([^"]+)"'
    match = re.search(pattern, page_content)
    if match:
        image_url = match.group(1)

    # Keywords array from the same JSON-LD block
    pattern = r'"keywords":\s*\[("[\w\s]+"(?:,\s*"[^"]+")*)\]'
    matches = re.search(pattern, page_content)

    if matches:
        keywords = [keyword.strip().strip('"') for keyword in matches.group(1).split(',')]
        hashtags = ['#' + keyword.replace(' ', '_') for keyword in keywords]
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags
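# Example of what the keywords pattern above accepts (illustrative, assumed input):
#   '"keywords":["AI", "Machine Learning"]'
#   -> group(1) == '"AI", "Machine Learning"'
#   -> hashtags == ['#AI', '#Machine_Learning']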

def extract_wired_image_url(page_content):
    """Return (image_url, hashtags) scraped from a WIRED article page."""
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of "w_640" (the 640px CDN rendition) in the HTML source
    index = html_source.find("w_640")

    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)

        if start_index != -1:
            # Extract the URL, keeping the "w_640" marker itself
            image_url = html_source[start_index:index + 5].strip()
        else:
            print("No image URL found")
    else:
        print("No image URL found")

    return image_url, r_hashtags


def extract_lithub_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Literary Hub article page."""
    image_url = None
    r_hashtags = None

    html_source = page_content

    # The twitter:image meta tag carries the lead image
    pattern = r'<meta name="twitter:image" content="(\S+)"'
    match = re.search(pattern, html_source)
    if match:
        image_url = match.group(1)
    else:
        print("Image URL not found")

    return image_url, r_hashtags

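
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The EXTRACTORS
# mapping, the extract_for_url() helper, the User-Agent string, and the
# example URL are all assumptions; only the extractor functions above come
# from this file.
EXTRACTORS = {
    'francetvinfo.fr': extract_francetvinfo_image_url,
    'vancouversun.com': extract_vancouver_image_url,
    'bbc.com': extract_bbc_image_url,
    'androidauthority.com': extract_androidauthority_image_url,
    'theguardian.com': extract_theguardian_image_url,
    'cbc.ca': extract_cbc_image_url,
    'techrepublic.com': extract_techrepublic_image_url,
    'time.com': extract_time_image_url,
    'wired.com': extract_wired_image_url,
    'lithub.com': extract_lithub_image_url,
}


def extract_for_url(url):
    """Fetch url and run the first extractor whose domain key appears in it."""
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.raise_for_status()
    for domain, extractor in EXTRACTORS.items():
        if domain in url:
            return extractor(response.text)
    return None, None


if __name__ == '__main__':
    # Placeholder article URL; substitute a real one when running.
    image_url, hashtags = extract_for_url('https://www.theguardian.com/example-article')
    print(image_url, hashtags)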