Delete image_utils.py
This commit is contained in: parent 11c3d1c6b7, commit 666565411c

image_utils.py: 348 deletions

@@ -1,348 +0,0 @@
import re
import requests
import html
from bs4 import BeautifulSoup

def extract_francetvinfo_image_url(page_content):
    """Return (image_url, hashtags) scraped from a francetvinfo.fr article page."""
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of " 720w" (the 720px srcset candidate) in the HTML source
    index = html_source.find(" 720w")

    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)

        if start_index != -1:
            # Extract the URL
            image_url = html_source[start_index:index].strip()

    # Now extract hashtags from the "keywords" JSON string
    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords_unescaped = html.unescape(keywords_str)
        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
        keywords = re.findall(r'([^,]+)', keywords_decoded)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags

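# Note on the round-trip above: the "keywords" value can carry literal
# "\u00e9"-style escape sequences; encode('utf-8').decode('unicode_escape')
# turns them into real characters. With an illustrative (assumed) input:
#   'politique,\u00e9conomie'  ->  'politique,économie'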

def extract_vancouver_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Vancouver Sun article page."""
    image_url = None
    r_hashtags = None

    context = page_content

    # First URL listed in the preload link's imagesrcset attribute
    regex_pattern = r'imagesrcset="([^"]+?),'
    matches = re.search(regex_pattern, context)

    if matches:
        image_url = matches.group(1)

    # Now extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_str = match.group(1)
        keywords = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_bbc_image_url(page_content):
    """Return (image_url, hashtags) scraped from a BBC article page."""
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    img_tag = soup.find('img')
    if img_tag:
        srcset = img_tag.get('srcset')
        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
        # Sort URLs by width in descending order, then keep the widest
        # candidate that is at most 480px wide
        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
        for url, width in urls:
            if int(width) <= 480:
                image_url = url
                break

    # The article's topic links double as keywords
    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
    keywords = [tag.text for tag in keyword_tags]
    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
    if hashtags:
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_androidauthority_image_url(page_content):
    """Return (image_url, hashtags) scraped from an Android Authority article page."""
    image_url = None
    r_hashtags = None
    html_content = page_content

    match = re.search(r'imageSrcSet="(.*?)"', html_content)
    if match:
        image_srcset = match.group(1)
        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
        # Keep the 712px-wide candidate
        for url, width in urls:
            if int(width) == 712:
                image_url = url
                break

    # Extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', html_content)
    if match:
        keywords_str = match.group(1)
        keywords_list = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_theguardian_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Guardian article page."""
    image_url = None
    r_hashtags = None

    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
    if match:
        # Ask the image CDN for a 620px-wide rendition
        image_url = match.group(0) + "?width=620&dpr=1&s=none"

    match = re.search(r'"keywords":"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords = [keyword.strip() for keyword in keywords_str.split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_cbc_image_url(page_content):
    """Return (image_url, hashtags) scraped from a CBC article page."""
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    # First image that carries an alt attribute
    image_tag = soup.find('img', alt=True)
    if image_tag:
        image_url = image_tag['src']

    # CBC embeds its keywords in a "gs_keywords" JSON array
    start_index = page_content.find('"gs_keywords":["')
    if start_index != -1:
        start_index += len('"gs_keywords":["')
        end_index = page_content.find('"]', start_index)
        if end_index != -1:
            keywords = page_content[start_index:end_index].split('","')
            hashtags = ['#' + keyword for keyword in keywords]
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_techrepublic_image_url(page_content):
    """Return (image_url, hashtags) scraped from a TechRepublic article page.

    No keyword extraction is implemented for this site, so hashtags is always None.
    """
    image_url = None
    r_hashtags = None

    # The og:image meta tag carries the lead image
    pattern = r'<meta property="og:image" content="([^"]+?)"'
    match = re.search(pattern, str(page_content))
    if match:
        image_url = match.group(1)

    return image_url, r_hashtags


def extract_time_image_url(page_content):
    """Return (image_url, hashtags) scraped from a TIME article page."""
    image_url = None
    r_hashtags = None

    # Lead image from the JSON-LD ImageObject
    pattern = r'"image":\[\{"@type":"ImageObject","url":"([^"]+)"'
    match = re.search(pattern, page_content)
    if match:
        image_url = match.group(1)

    # Keywords array from the same JSON-LD block
    pattern = r'"keywords":\s*\[("[\w\s]+"(?:,\s*"[^"]+")*)\]'
    matches = re.search(pattern, page_content)

    if matches:
        keywords = [keyword.strip().strip('"') for keyword in matches.group(1).split(',')]
        hashtags = ['#' + keyword.replace(' ', '_') for keyword in keywords]
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags
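# Example of what the keywords pattern above accepts (illustrative, assumed input):
#   '"keywords":["AI", "Machine Learning"]'
#   -> group(1) == '"AI", "Machine Learning"'
#   -> hashtags == ['#AI', '#Machine_Learning']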

def extract_wired_image_url(page_content):
    """Return (image_url, hashtags) scraped from a WIRED article page."""
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of "w_640" (the 640px CDN rendition) in the HTML source
    index = html_source.find("w_640")

    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)

        if start_index != -1:
            # Extract the URL, keeping the "w_640" marker itself
            image_url = html_source[start_index:index + 5].strip()
        else:
            print("No image URL found")
    else:
        print("No image URL found")

    return image_url, r_hashtags


def extract_lithub_image_url(page_content):
    """Return (image_url, hashtags) scraped from a Literary Hub article page."""
    image_url = None
    r_hashtags = None

    html_source = page_content

    # The twitter:image meta tag carries the lead image
    pattern = r'<meta name="twitter:image" content="(\S+)"'
    match = re.search(pattern, html_source)
    if match:
        image_url = match.group(1)
    else:
        print("Image URL not found")

    return image_url, r_hashtags

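
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The EXTRACTORS
# mapping, the extract_for_url() helper, the User-Agent string, and the
# example URL are all assumptions; only the extractor functions above come
# from this file.
EXTRACTORS = {
    'francetvinfo.fr': extract_francetvinfo_image_url,
    'vancouversun.com': extract_vancouver_image_url,
    'bbc.com': extract_bbc_image_url,
    'androidauthority.com': extract_androidauthority_image_url,
    'theguardian.com': extract_theguardian_image_url,
    'cbc.ca': extract_cbc_image_url,
    'techrepublic.com': extract_techrepublic_image_url,
    'time.com': extract_time_image_url,
    'wired.com': extract_wired_image_url,
    'lithub.com': extract_lithub_image_url,
}


def extract_for_url(url):
    """Fetch url and run the first extractor whose domain key appears in it."""
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.raise_for_status()
    for domain, extractor in EXTRACTORS.items():
        if domain in url:
            return extractor(response.text)
    return None, None


if __name__ == '__main__':
    # Placeholder article URL; substitute a real one when running.
    image_url, hashtags = extract_for_url('https://www.theguardian.com/example-article')
    print(image_url, hashtags)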