# mastodon_news_poster/image_utils.py
import html
import re

import requests
from bs4 import BeautifulSoup
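

# The extractors below all take raw page HTML as a string. A minimal fetch
# helper is sketched here since the module imports requests; the name
# _fetch_page and the User-Agent header are illustrative assumptions, not
# part of the original module.
def _fetch_page(url, timeout=10):
    """Fetch a page and return its HTML, or None on any request error."""
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "mastodon-news-poster/1.0"},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None
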
def extract_francetvinfo_image_url(page_content):
    """Return (image_url, hashtags) scraped from a francetvinfo article page."""
    image_url = None
    r_hashtags = None
    html_source = page_content
    # Find the index of the " 720w" srcset width marker in the HTML source
    index = html_source.find(" 720w")
    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)
        if start_index != -1:
            # Extract the URL (everything between "https" and " 720w")
            image_url = html_source[start_index:index].strip()
    # Now extract hashtags from the JSON-LD "keywords" string
    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords_unescaped = html.unescape(keywords_str)
        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
        keywords = re.findall(r'([^,]+)', keywords_decoded)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]
    return image_url, r_hashtags


def extract_vancouver_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Vancouver Sun article page."""
    image_url = None
    r_hashtags = None
    # Take the first entry of the imagesrcset attribute, up to the first comma
    match = re.search(r'imagesrcset="([^"]+?),', page_content)
    if match:
        image_url = match.group(1)
    # Now extract hashtags from the JSON-LD "keywords" array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_str = match.group(1)
        keywords = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]
    return image_url, r_hashtags


def extract_bbc_image_url(page_content):
    """Return (image_url, hashtags) scraped from a BBC article page."""
    image_url = None
    r_hashtags = None
    soup = BeautifulSoup(page_content, 'html.parser')
    img_tag = soup.find('img')
    if img_tag:
        srcset = img_tag.get('srcset') or ''
        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', srcset)
        # Sort URLs by width in descending order, then take the widest one
        # that is at most 480px wide
        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
        for url, width in urls:
            if int(width) <= 480:
                image_url = url
                break
    # Topic links on BBC pages carry the article keywords
    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
    keywords = [tag.text for tag in keyword_tags]
    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
    if hashtags:
        r_hashtags = hashtags[:5]
    return image_url, r_hashtags


def extract_androidauthority_image_url(page_content):
    """Return (image_url, hashtags) scraped from an Android Authority article page."""
    image_url = None
    r_hashtags = None
    match = re.search(r'imageSrcSet="(.*?)"', page_content)
    if match:
        image_srcset = match.group(1)
        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
        # Keep the 712px-wide variant
        for url, width in urls:
            if int(width) == 712:
                image_url = url
                break
    # Now extract hashtags from the JSON-LD "keywords" array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_str = match.group(1)
        keywords_list = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
        if hashtags:
            r_hashtags = hashtags[:5]
    return image_url, r_hashtags


def extract_theguardian_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Guardian article page."""
    image_url = None
    r_hashtags = None
    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
    if match:
        image_url = match.group(0) + "?width=620&dpr=1&s=none"
    # Now extract hashtags from the comma-separated "keywords" string
    match = re.search(r'"keywords":"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords = [keyword.strip() for keyword in keywords_str.split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]
    return image_url, r_hashtags


def extract_cbc_image_url(page_content):
    """Return (image_url, hashtags) scraped from a CBC article page."""
    image_url = None
    r_hashtags = None
    soup = BeautifulSoup(page_content, 'html.parser')
    image_tag = soup.find('img', alt=True)
    if image_tag:
        image_url = image_tag['src']
    # Now extract hashtags from the "gs_keywords" array
    start_index = page_content.find('"gs_keywords":["')
    if start_index != -1:
        start_index += len('"gs_keywords":["')
        end_index = page_content.find('"]', start_index)
        if end_index != -1:
            keywords = page_content[start_index:end_index].split('","')
            hashtags = ['#' + keyword for keyword in keywords]
            r_hashtags = hashtags[:5]
    return image_url, r_hashtags


def extract_techrepublic_image_url(page_content):
    """Return (image_url, hashtags) scraped from a TechRepublic article page."""
    image_url = None
    r_hashtags = None
    # The og:image meta tag carries the lead image
    match = re.search(r'<meta property="og:image" content="([^"]+?)"', page_content)
    if match:
        image_url = match.group(1)
    return image_url, r_hashtags


def extract_time_image_url(page_content):
    """Return (image_url, hashtags) scraped from a TIME article page."""
    image_url = None
    r_hashtags = None
    # The JSON-LD ImageObject carries the lead image
    match = re.search(r'"image":\[\{"@type":"ImageObject","url":"([^"]+)"', page_content)
    if match:
        image_url = match.group(1)
    # Now extract hashtags from the JSON-LD "keywords" array
    match = re.search(r'"keywords":\s*\[("[\w\s]+"(?:,\s*"[^"]+")*)\]', page_content)
    if match:
        keywords = [keyword.strip(' "') for keyword in match.group(1).split(',')]
        hashtags = ['#' + keyword.replace(' ', '_') for keyword in keywords]
        r_hashtags = hashtags[:5]
    return image_url, r_hashtags


def extract_wired_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Wired article page.

    Hashtags are not extracted for Wired, so the second value is always None.
    """
    image_url = None
    r_hashtags = None
    # Find the index of the "w_640" width marker in the HTML source
    index = page_content.find("w_640")
    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = page_content.rfind("https", 0, index)
        if start_index != -1:
            # Extract the URL, including the "w_640" marker itself
            image_url = page_content[start_index:index + len("w_640")].strip()
    if image_url is None:
        print("No image URL found")
    return image_url, r_hashtags


def extract_lithub_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Literary Hub article page.

    Hashtags are not extracted for Literary Hub, so the second value is always None.
    """
    image_url = None
    r_hashtags = None
    # The twitter:image meta tag carries the lead image
    match = re.search(r'<meta name="twitter:image" content="(\S+)"', page_content)
    if match:
        image_url = match.group(1)
    else:
        print("Image URL not found")
    return image_url, r_hashtags
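

# A minimal usage sketch, not part of the original module: dispatch a fetched
# page to the matching extractor by domain. The DOMAIN_EXTRACTORS keys and
# the get_article_media name are illustrative assumptions; the sketch relies
# on the _fetch_page helper sketched near the top of the file.
from urllib.parse import urlparse

DOMAIN_EXTRACTORS = {
    "francetvinfo.fr": extract_francetvinfo_image_url,
    "vancouversun.com": extract_vancouver_image_url,
    "bbc.com": extract_bbc_image_url,
    "androidauthority.com": extract_androidauthority_image_url,
    "theguardian.com": extract_theguardian_image_url,
    "cbc.ca": extract_cbc_image_url,
    "techrepublic.com": extract_techrepublic_image_url,
    "time.com": extract_time_image_url,
    "wired.com": extract_wired_image_url,
    "lithub.com": extract_lithub_image_url,
}


def get_article_media(url):
    """Return (image_url, hashtags) for url, or (None, None) if nothing matches."""
    netloc = urlparse(url).netloc.lower()
    for domain, extractor in DOMAIN_EXTRACTORS.items():
        if netloc == domain or netloc.endswith("." + domain):
            page_content = _fetch_page(url)
            if page_content is not None:
                return extractor(page_content)
            break
    return None, None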