diff --git a/image_utils.py b/image_utils.py
new file mode 100644
index 0000000..b468804
--- /dev/null
+++ b/image_utils.py
@@ -0,0 +1,284 @@
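+"""Per-site helpers for pulling a lead image URL and a short list of hashtags
+out of the raw HTML of a news article.
+
+Each extract_<site>_image_url(page_content) function takes the page HTML as a
+string and returns an (image_url, hashtags) tuple; either element may be None
+when nothing usable is found.
+"""
+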
+import re
+import requests
+import html
+import codecs
+from bs4 import BeautifulSoup
+
+def extract_francetvinfo_image_url(page_content):
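+    """Return (image_url, hashtags) for a francetvinfo article page: the image
+    URL that precedes a " 720w" srcset descriptor and up to five hashtags built
+    from the page's "keywords" field. Either element may be None."""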
+ image_url = None
+ r_hashtags = None
+
+ html_source = page_content
+
+    # Find the index of " 720w" in the HTML source
+    index = html_source.find(" 720w")
+
+    if index != -1:
+        # Find the preceding "https" and slice out the URL ending at " 720w"
+        start_index = html_source.rfind("https", 0, index)
+
+        if start_index != -1:
+            image_url = html_source[start_index:index].strip()
+
+    # Extract hashtags from the page's "keywords" field
+    match = re.search(r'"keywords":\s*"([^"]+)"', page_content)
+    if match and match.group(1):
+        keywords_str = match.group(1)
+        keywords_unescaped = html.unescape(keywords_str)
+        # Decode literal \uXXXX escape sequences without mangling accented characters
+        keywords_decoded = keywords_unescaped.encode('ascii', 'backslashreplace').decode('unicode_escape')
+        keywords = [keyword.strip() for keyword in keywords_decoded.split(',') if keyword.strip()]
+        hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+        if hashtags:
+            r_hashtags = hashtags[:5]
+ return image_url, r_hashtags
+
+
+def extract_vancouver_image_url(page_content):
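+    """Return (image_url, hashtags) for a Vancouver article page: the first
+    entry of the imagesrcset preload attribute and up to five hashtags from the
+    JSON "keywords" list. Either element may be None."""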
+ image_url = None
+ r_hashtags = None
+
+ context = page_content
+    # The page preloads the hero image via an imagesrcset attribute; take the first entry
+    regex_pattern = r'imagesrcset="([^"]+?),'
+    matches = re.search(regex_pattern, context)
+
+    if matches:
+        # Drop a trailing width descriptor (e.g. " 300w") if one is present
+        image_url = matches.group(1).split()[0]
+
+ # Now extract hashtags
+ match = re.search(r'"keywords":\s*\[([^\]]+)\]', page_content)
+ if match:
+ keywords_str = match.group(1)
+ keywords = re.findall(r'"([^"]+)"', keywords_str)
+ hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+ if hashtags:
+ r_hashtags = hashtags[:5]
+ else:
+ r_hashtags = None
+
+ return image_url, r_hashtags
+
+
+def extract_bbc_image_url(page_content):
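+    """Return (image_url, hashtags) for a BBC article page: a srcset variant of
+    the first image no wider than 480px and up to five hashtags from the tagged
+    topic links. Either element may be None."""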
+
+ image_url = None
+ r_hashtags = None
+ soup = BeautifulSoup(page_content, 'html.parser')
+
+    img_tag = soup.find('img')
+    if img_tag:
+        srcset = img_tag.get('srcset')
+        urls = re.findall(r'(https?://[^\s,]+\.jpg) (\d+)w', str(srcset))
+        # Sort URLs by width in descending order
+        urls = sorted(urls, key=lambda x: int(x[1]), reverse=True)
+
+        # Take the widest variant that is no more than 480px wide
+        for url, width in urls:
+            if int(width) <= 480:
+                image_url = url
+                break
+
+ keyword_tags = soup.find_all('a', class_='ssrcss-w6az1r-StyledLink ed0g1kj0')
+ keywords = [tag.text for tag in keyword_tags]
+ hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+ if hashtags:
+ r_hashtags = hashtags[:5]
+ else:
+ r_hashtags = None
+
+ return image_url, r_hashtags
+
+
+def extract_androidauthority_image_url(page_content):
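+    """Return (image_url, hashtags) for an Android Authority article page: the
+    712px-wide imageSrcSet variant and up to five hashtags from the JSON
+    "keywords" list. Either element may be None."""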
+
+ image_url = None
+ r_hashtags = None
+ html_content = page_content
+
+ match = re.search(r'imageSrcSet="(.*?)"', html_content)
+ if match:
+ image_srcset = match.group(1)
+ urls = re.findall(r'(https?://[^\s,]+\.webp.*?)\s(\d+)w', image_srcset)
+        # Prefer the 712px-wide variant when present
+        for url, width in urls:
+            if int(width) == 712:
+                image_url = url
+                break
+
+ match = re.search(r'"keywords":\s*\[([^\]]+)\]', html_content)
+ if match:
+ keywords_str = match.group(1)
+ keywords_list = re.findall(r'"([^"]+)"', keywords_str)
+ hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords_list]
+ if hashtags:
+ r_hashtags = hashtags[:5]
+ else:
+ r_hashtags = None
+
+ return image_url, r_hashtags
+
+
+def extract_theguardian_image_url(page_content):
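+    """Return (image_url, hashtags) for a Guardian article page: the first .jpg
+    src with a 620px-width query string appended and up to five hashtags from
+    the "keywords" field. Either element may be None."""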
+
+ image_url = None
+ r_hashtags = None
+
+    match = re.search(r'(?<=src=")(https?://.*?\.jpg)', page_content)
+    if match:
+        # Request a 620px-wide rendition of the first article image
+        image_url = match.group(0) + "?width=620&dpr=1&s=none"
+
+ match = re.search(r'"keywords":"([^"]+)"', page_content)
+ if match and match.group(1):
+ keywords_str = match.group(1)
+ keywords = [keyword.strip() for keyword in keywords_str.split(',')]
+ hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+ if hashtags:
+ r_hashtags = hashtags[:5]
+ else:
+ r_hashtags = None
+ return image_url, r_hashtags
+
+
+def extract_cbc_image_url(page_content):
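+    """Return (image_url, hashtags) for a CBC article page: the src of the
+    first <img> carrying an alt attribute and up to five hashtags from the
+    "gs_keywords" list. Either element may be None."""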
+
+ image_url = None
+ r_hashtags = None
+
+ soup = BeautifulSoup(page_content, 'html.parser')
+
+    # Use the first <img> tag that carries an alt attribute as the lead image
+    image_tag = soup.find('img', alt=True)
+    if image_tag:
+        image_url = image_tag.get('src')
+
+ start_index = page_content.find('"gs_keywords":["')
+ if start_index != -1:
+ start_index += len('"gs_keywords":["')
+ end_index = page_content.find('"]', start_index)
+ if end_index != -1:
+ keywords = page_content[start_index:end_index].split('","')
+            hashtags = ['#' + keyword.replace(' ', '') for keyword in keywords]
+ r_hashtags = hashtags[:5]
+ else:
+ r_hashtags = None
+ return image_url, r_hashtags
+
+def extract_techrepublic_image_url(page_content):
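+    """Return (image_url, hashtags) for a TechRepublic article page; up to five
+    hashtags are built from the comma-separated keywords metadata. Either
+    element may be None."""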
+
+ image_url = None
+ r_hashtags = None
+
+    # Comma-separated keywords from the page's keywords metadata (regex is an assumed placeholder)
+    pattern = r'name="keywords"\s+content="([^"]+)"'
+    keywords_str = re.search(pattern, page_content)
+ if keywords_str and keywords_str.group(1):
+ keywords = keywords_str.group(1).split(',')
+ hashtags = ['#' + keyword.strip().replace(' ', '') for keyword in keywords]
+
+ if hashtags:
+ r_hashtags = hashtags[:5]
+ else:
+ r_hashtags = None
+
+ return image_url, r_hashtags
+
+
+def extract_lithub_image_url(page_content):
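+    """Return (image_url, hashtags) for a lithub article page. Either element
+    may be None."""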
+
+ image_url = None
+ r_hashtags = None
+
+ html_source = page_content
+
+ pattern = r'