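"""Per-site scrapers for news article pages.

Each extract_<site>_image_url function takes the raw HTML of an article page
(page_content) and returns an (image_url, r_hashtags) tuple: a representative
image URL (or None) and up to five '#'-prefixed hashtags built from the page's
keyword metadata (or None).
"""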
import re
import requests
import html
import codecs

from bs4 import BeautifulSoup


def extract_francetvinfo_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of " 720w" in the HTML source
    index = html_source.find(" 720w")
    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)
        if start_index != -1:
            # Extract the URL
            image_url = html_source[start_index:index].strip()

    # Now extract hashtags
    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords_unescaped = html.unescape(keywords_str)
        keywords_decoded = keywords_unescaped.encode('utf-8').decode('unicode_escape')
        keywords = re.findall(r'([^,]+)', keywords_decoded)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_vancouver_image_url(page_content):
    image_url = None
    r_hashtags = None

    context = page_content

    # The article image is exposed via an imagesrcset attribute; capture
    # everything up to the first comma (the first candidate in the set)
    regex_pattern = r'imagesrcset="([^"]+?),'
    matches = re.search(regex_pattern, context)
    if matches:
        image_url = matches.group(1)

    # Now extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
    if match:
        keywords_str = match.group(1)
        keywords = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_bbc_image_url(page_content):
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    img_tag = soup.find('img')
    if img_tag:
        srcset = img_tag.get('srcset')
        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)  # Sort URLs by width in descending order

        # Take the largest variant that is at most 480px wide
        for url, width in urls:
            if int(width) <= 480:
                image_url = url
                break

    # Topic links on the page supply the keywords used for hashtags
    keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
    keywords = [tag.text for tag in keyword_tags]
    hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
    if hashtags:
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_androidauthority_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_content = page_content

    # Image candidates are listed in an imageSrcSet attribute; take the 712w variant
    match = re.search(r'imageSrcSet="(.*?)"', html_content)
    if match:
        image_srcset = match.group(1)
        urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
        for url, width in urls:
            if int(width) == 712:
                image_url = url
                break

    # Now extract hashtags from the "keywords" JSON array
    match = re.search(r'"keywords":\s*\[([^\]]+)\]', html_content)
    if match:
        keywords_str = match.group(1)
        keywords_list = re.findall(r'"([^"]+)"', keywords_str)
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_theguardian_image_url(page_content):
    image_url = None
    r_hashtags = None

    # First .jpg found in a src attribute, requested at 620px wide
    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
    if match:
        image_url = match.group(0) + "?width=620&dpr=1&s=none"

    # Keywords are a comma-separated string in the page metadata
    match = re.search(r'"keywords":"([^"]+)"', page_content)
    if match and match.group(1):
        keywords_str = match.group(1)
        keywords = [keyword.strip() for keyword in keywords_str.split(',')]
        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
        if hashtags:
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_cbc_image_url(page_content):
    image_url = None
    r_hashtags = None

    soup = BeautifulSoup(page_content, 'html.parser')

    # First <img> tag that carries an alt attribute
    image_tag = soup.find('img', alt=True)
    if image_tag:
        image_url = image_tag['src']

    # Keywords live in a "gs_keywords" JSON array
    start_index = page_content.find('"gs_keywords":["')
    if start_index != -1:
        start_index += len('"gs_keywords":["')
        end_index = page_content.find('"]', start_index)
        if end_index != -1:
            keywords = page_content[start_index:end_index].split('","')
            hashtags = ['#' + keyword for keyword in keywords]
            r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_techrepublic_image_url(page_content):
    image_url = None
    r_hashtags = None

    # Use the Open Graph image as the article image
    pattern = r'<meta property="og:image" content="([^"]+?)"'
    match = re.search(pattern, str(page_content))
    if match:
        image_url = match.group(1)

    # No keyword extraction for this site, so r_hashtags stays None
    return image_url, r_hashtags


def extract_time_image_url(page_content):
    image_url = None
    r_hashtags = None

    # Image URL from the JSON-LD ImageObject
    pattern = r'"image":\[\{"@type":"ImageObject","url":"([^"]+)"'
    match = re.search(pattern, page_content)
    if match:
        image_url = match.group(1)

    # Keywords from the JSON-LD "keywords" array
    pattern = r'"keywords":\s*\[("[\w\s]+"(?:,\s*"[^"]+")*)\]'
    matches = re.search(pattern, page_content)
    if matches:
        keywords = [keyword.strip().strip('"') for keyword in matches.group(1).split(',')]
        hashtags = ['#' + keyword.replace(' ', '_') for keyword in keywords]
        r_hashtags = hashtags[:5]

    return image_url, r_hashtags


def extract_wired_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Find the index of the "w_640" width marker in the HTML source
    index = html_source.find("w_640")
    if index != -1:
        # Find the index of the preceding "https" starting from the found index
        start_index = html_source.rfind("https", 0, index)
        if start_index != -1:
            # Extract the URL, keeping the "w_640" marker itself
            image_url = html_source[start_index:index + 5].strip()

    if image_url is None:
        print("No image URL found")

    # No keyword extraction for this site, so r_hashtags stays None
    return image_url, r_hashtags


def extract_lithub_image_url(page_content):
    image_url = None
    r_hashtags = None

    html_source = page_content

    # Use the twitter:image meta tag as the article image
    pattern = r'<meta name="twitter:image" content="(\S+)"'
    match = re.search(pattern, html_source)
    if match:
        image_url = match.group(1)
    else:
        print("Image URL not found")

    # No keyword extraction for this site, so r_hashtags stays None
    return image_url, r_hashtags


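# Minimal usage sketch, not part of the original module: fetch an article with
# requests and dispatch to the matching extractor by hostname. The hostname keys,
# the test URL, and the User-Agent header below are assumptions for illustration;
# each extractor returns an (image_url, r_hashtags) tuple.
if __name__ == "__main__":
    from urllib.parse import urlparse

    extractors = {
        "francetvinfo.fr": extract_francetvinfo_image_url,
        "vancouversun.com": extract_vancouver_image_url,
        "bbc.com": extract_bbc_image_url,
        "androidauthority.com": extract_androidauthority_image_url,
        "theguardian.com": extract_theguardian_image_url,
        "cbc.ca": extract_cbc_image_url,
        "techrepublic.com": extract_techrepublic_image_url,
        "time.com": extract_time_image_url,
        "wired.com": extract_wired_image_url,
        "lithub.com": extract_lithub_image_url,
    }

    test_url = "https://www.bbc.com/news/technology"  # placeholder article URL
    host = urlparse(test_url).netloc.removeprefix("www.")
    extractor = extractors.get(host)
    if extractor is not None:
        response = requests.get(test_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        image_url, hashtags = extractor(response.text)
        print(image_url)
        print(hashtags)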