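"""Per-site scrapers for news article pages.

Each extract_<site>_image_url function takes the raw HTML of an article page
(page_content) and returns an (image_url, r_hashtags) tuple: a representative
image URL (or None) and up to five '#'-prefixed hashtags built from the page's
keyword metadata (or None).
"""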
import re
import requests
import html
import codecs

from bs4 import BeautifulSoup


def extract_francetvinfo_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of " 720w" in the HTML source
    index = html_source.find(" 720w")
    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)
        if start_index != -1:
            # Extract the URL
            image_url = html_source[start_index:index].strip()

    # Now extract hashtags
    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords_unescaped = html.unescape(keywords_str)
        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
        keywords = re.findall(r'([^,]+)', keywords_decoded)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_vancouver_image_url(page_content):
    image_url = None
    r_hashtags = None

    context = page_content

    # The article image is exposed via an imagesrcset attribute; capture
    # everything up to the first comma (the first candidate in the set)
    regex_pattern = r'imagesrcset="([^"]+?),'
    matches = re.search(regex_pattern, context)
    if matches:
        image_url = matches.group(1)

    # Now extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_str = match.group(1)
        keywords = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_bbc_image_url(page_content):
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    img_tag = soup.find('img')
    if img_tag:
        srcset = img_tag.get('srcset')
        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)  # Sort URLs by width in descending order

        # Take the largest variant that is at most 480px wide
        for url, width in urls:
            if int(width) <= 480:
                image_url = url
                break

    # Topic links on the page supply the keywords used for hashtags
    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
    keywords = [tag.text for tag in keyword_tags]
    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
    if hashtags:
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_androidauthority_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_content = page_content

    # Image candidates are listed in an imageSrcSet attribute; take the 712w variant
    match = re.search(r'imageSrcSet="(.*?)"', html_content)
    if match:
        image_srcset = match.group(1)
        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
        for url, width in urls:
            if int(width) == 712:
                image_url = url
                break

    # Now extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', html_content)
    if match:
        keywords_str = match.group(1)
        keywords_list = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_theguardian_image_url(page_content):
    image_url = None
    r_hashtags = None

    # First .jpg found in a src attribute, requested at 620px wide
    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
    if match:
        image_url = match.group(0) + "?width=620&dpr=1&s=none"

    # Keywords are a comma-separated string in the page metadata
    match = re.search(r'"keywords":"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords = [keyword.strip() for keyword in keywords_str.split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_cbc_image_url(page_content):
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    # First <img> tag that carries an alt attribute
    image_tag = soup.find('img', alt=True)
    if image_tag:
        image_url = image_tag['src']

    # Keywords live in a "gs_keywords" JSON array
    start_index = page_content.find('"gs_keywords":["')
    if start_index != -1:
        start_index += len('"gs_keywords":["')
        end_index = page_content.find('"]', start_index)
        if end_index != -1:
            keywords = page_content[start_index:end_index].split('","')
            hashtags = ['#' + keyword for keyword in keywords]
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_techrepublic_image_url(page_content):
    image_url = None
    r_hashtags = None

    # Use the Open Graph image as the article image
    pattern = r'<meta property="og:image" content="([^"]+?)"'
    match = re.search(pattern, str(page_content))
    if match:
        image_url = match.group(1)

    # No keyword extraction for this site, so r_hashtags stays None
    return image_url, r_hashtags


def extract_time_image_url(page_content):
    image_url = None
    r_hashtags = None

    # Image URL from the JSON-LD ImageObject
    pattern = r'"image":\[\{"@type":"ImageObject","url":"([^"]+)"'
    match = re.search(pattern, page_content)
    if match:
        image_url = match.group(1)

    # Keywords from the JSON-LD "keywords" array
    pattern = r'"keywords":\s*\[("[\w\s]+"(?:,\s*"[^"]+")*)\]'
    matches = re.search(pattern, page_content)
    if matches:
        keywords = [keyword.strip().strip('"') for keyword in matches.group(1).split(',')]
        hashtags = ['#' + keyword.replace(' ', '_') for keyword in keywords]
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_wired_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of the "w_640" width marker in the HTML source
    index = html_source.find("w_640")
    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)
        if start_index != -1:
            # Extract the URL, keeping the "w_640" marker itself
            image_url = html_source[start_index:index + 5].strip()

    if image_url is None:
        print("No image URL found")

    # No keyword extraction for this site, so r_hashtags stays None
    return image_url, r_hashtags


def extract_lithub_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Use the twitter:image meta tag as the article image
    pattern = r'<meta name="twitter:image" content="(\S+)"'
    match = re.search(pattern, html_source)
    if match:
        image_url = match.group(1)
    else:
        print("Image URL not found")

    # No keyword extraction for this site, so r_hashtags stays None
    return image_url, r_hashtags


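# Minimal usage sketch, not part of the original module: fetch an article with
# requests and dispatch to the matching extractor by hostname. The hostname keys,
# the test URL, and the User-Agent header below are assumptions for illustration;
# each extractor returns an (image_url, r_hashtags) tuple.
if __name__ == "__main__":
    from urllib.parse import urlparse

    extractors = {
        "francetvinfo.fr": extract_francetvinfo_image_url,
        "vancouversun.com": extract_vancouver_image_url,
        "bbc.com": extract_bbc_image_url,
        "androidauthority.com": extract_androidauthority_image_url,
        "theguardian.com": extract_theguardian_image_url,
        "cbc.ca": extract_cbc_image_url,
        "techrepublic.com": extract_techrepublic_image_url,
        "time.com": extract_time_image_url,
        "wired.com": extract_wired_image_url,
        "lithub.com": extract_lithub_image_url,
    }

    test_url = "https://www.bbc.com/news/technology"  # placeholder article URL
    host = urlparse(test_url).netloc.removeprefix("www.")
    extractor = extractors.get(host)
    if extractor is not None:
        response = requests.get(test_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        image_url, hashtags = extractor(response.text)
        print(image_url)
        print(hashtags)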