import requests
import image_utils
import os
import io
import logging
import cv2
import imageio
import shutil
import subprocess
import time
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from pathlib import Path
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont

RSS_URL = 'https://fresh.franv.site/i/?a=rss&user=fossilfranv&token=sdfggf456456465xcvxcvxvc&hours=168'
MASTODON_TOKEN = 'J65EiYQMpc-hY3CaUJaQPHdXxV7-KiKZjlr0QPESlVQ'
MASTODON_HOST = 'https://mast.airdog.site'

search_terms = ["slashdot", "time", "bbc", "cbc", "francetvinfo", "lithub",
                "theguardian", "vancouversun", "techrepublic", "ycombinator",
                "spiegel", "wired", "androidauthority"]

# logo_path is defined at module level because it is modified in process_news()
logo_path = ""
# Define image_path the same way for convenience
image_path = "/home/franv/mast_bot/images/11.jpg"


def post_to_mastodon(source, title, description, link, image_url, hashtags):
    global logo_path
    image_path = "/home/franv/mast_bot/images/11.jpg"
    load_images_path = "/home/franv/mast_bot/images/"
    media_ids = []

    # Write the article image to the images folder for later retrieval
    if image_url and is_valid_url(image_url):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        img_data = requests.get(image_url, headers=headers, timeout=10).content
        with open(image_path, 'wb') as handler:
            handler.write(img_data)

        # Add the bottom band and source logo to the image;
        # fall back to the default image if that fails
        new_image = add_bottom_band_with_logo(image_path, 0.15, (220, 220, 220), logo_path)
        if not new_image:
            new_image = Image.open("/home/franv/mast_bot/logos/news.jpg")
        new_image.save(image_path)
    else:
        # No image: just replace 11.jpg with the default image
        temp_image = Image.open("/home/franv/mast_bot/logos/news.jpg")
        temp_image.save("/home/franv/mast_bot/images/11.jpg")

    # Originally intended to post several images, but now only one image is posted
    IMG_FILES = [filename for filename in os.listdir(load_images_path)
                 if os.path.isfile(os.path.join(load_images_path, filename))]
    for file in IMG_FILES:
        url = f"{MASTODON_HOST}/api/v1/media"
        with open(os.path.join(load_images_path, file), 'rb') as f:
            files = {'file': f}
            r = requests.post(url, files=files, headers={'Authorization': f'Bearer {MASTODON_TOKEN}'})
        response_json = r.json()
        if r.status_code == 200:
            media_id = response_json['id']
            media_ids.append(media_id)

    # Compose status_text which, together with the images, is the only content posted to Mastodon
    if source and title and description and link:
        status_text = (source.upper() + "\n\n" + title.upper() + "\n\n"
                       + " " + description + "\n\n" + link + "\n\n" + str(hashtags))
        data = {
            "status": status_text,
            "media_ids[]": media_ids,
            "description": description,
            "link": link
        }
        # Post the status to Mastodon
        url = f"{MASTODON_HOST}/api/v1/statuses"
        r = requests.post(url, data=data, headers={'Authorization': f'Bearer {MASTODON_TOKEN}'})
        json_data = r.json()

    return None


def read_news():
    # Make a request to the RSS feed URL
    response = requests.get(RSS_URL)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the XML content using ElementTree
        root = ET.fromstring(response.content)
        items = list(root.findall('.//item'))
        return items
    else:
        return None
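
# NOTE: is_valid_url() is called in post_to_mastodon() but is not defined in this
# excerpt. The helper below is a minimal sketch of the assumed behaviour (a basic
# scheme/netloc check with urlparse); if the real helper exists elsewhere in the
# module, use that definition instead.
def is_valid_url(url):
    """Return True if url parses with both a scheme and a network location."""
    try:
        parsed = urlparse(url)
        return bool(parsed.scheme) and bool(parsed.netloc)
    except ValueError:
        return False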
def get_news(items):
    # Initialize the main counter used to pace posting
    main_counter = 0

    # Iterate over each item element in the XML
    for i, item in enumerate(items):
        # Extract the desired information from each item
        title_element = item.find('.//title')
        title = title_element.text if title_element is not None else None

        description_element = item.find('.//description')
        description = description_element.text if description_element is not None else None
        # Clean the description of HTML tags and non-printable characters
        soup = BeautifulSoup(description, 'html.parser')
        description = soup.get_text()[:250]

        link_element = item.find('.//link')
        link = link_element.text if link_element is not None else None

        enclosure_element = item.find('.//enclosure')
        enclosure = enclosure_element.get('url') if enclosure_element is not None else None

        media_ids = []

        date_element = item.find('.//pubDate')
        date = date_element.text if date_element is not None else None

        displaydate_element = item.find('.//displaydate')
        displaydate = displaydate_element.text if displaydate_element is not None else None

        # Create a newsInfo dict with the extracted information
        newsInfo = {
            'title': title,
            'description': description,
            'link': link,
            'enclosure': enclosure,
            'media_ids': media_ids,
            'date': date,
            'displaydate': displaydate,
            'image_url': None,
            'hashtags': None
            # Add more fields as needed
        }

        # Add line feeds to the console output to make it more legible
        print("\n" * 2)

        # Extract the source from the newsInfo['link'] URL;
        # the source is going to be needed in process_news()
        url = newsInfo['link']
        parsed_url = urlparse(url)
        found_term = None
        source = None

        # Search for a known term in the URL host
        term_index = 0
        while term_index < len(search_terms) and not found_term:
            term = search_terms[term_index]
            if term in parsed_url.netloc.lower():
                found_term = term
            term_index += 1

        if found_term is not None:
            source = found_term
        else:
            # Fall back to looking in the description
            description = newsInfo['description'][:50].lower()
            for term in search_terms:
                if term in description:
                    found_term = term
                    source = found_term

        # Get the page content for process_news()
        try:
            response = requests.get(newsInfo['link'],
                                    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"},
                                    timeout=5)
            print(response)
            if response.status_code == 200:
                page_content = response.text
                if process_news(page_content, source, newsInfo):
                    if not newsInfo['image_url']:
                        newsInfo['image_url'] = None
                    post_to_mastodon(source, newsInfo['title'], newsInfo['description'],
                                     newsInfo['link'], newsInfo['image_url'], newsInfo['hashtags'])
        except requests.Timeout:
            continue

        print(newsInfo)

        # Delay posting so as not to overwhelm Mastodon
        if main_counter < 6:
            time.sleep(30)
            main_counter += 1
        else:
            main_counter = 0
            time.sleep(300)

    # return source, newsInfo
    # input("Press Enter to continue...")
    # return source, newsInfo
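
# NOTE: add_bottom_band_with_logo() is called in post_to_mastodon() but is not defined
# in this excerpt. The sketch below is an assumed implementation: it appends a solid
# band to the bottom of the image, pastes the source logo onto it, and returns the
# composited PIL Image, or None on failure (post_to_mastodon then falls back to the
# default image). Replace it with the real helper if it is defined elsewhere.
def add_bottom_band_with_logo(image_path, band_ratio, band_color, logo_path):
    try:
        base = Image.open(image_path).convert("RGB")
        band_height = max(1, int(base.height * band_ratio))

        # New canvas: original image on top, solid-colour band underneath
        canvas = Image.new("RGB", (base.width, base.height + band_height), band_color)
        canvas.paste(base, (0, 0))

        # Scale the logo to fit inside the band and centre it horizontally
        logo = Image.open(logo_path).convert("RGB")
        scale = band_height / logo.height
        logo = logo.resize((max(1, int(logo.width * scale)), band_height))
        canvas.paste(logo, ((canvas.width - logo.width) // 2, base.height))

        return canvas
    except (OSError, ValueError):
        return None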

# Process the news according to source
def process_news(page_content, source, newsInfo):
    global logo_path

    if source == "androidauthority":
        image_url, r_hashtags = image_utils.extract_androidauthority_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/" + "androidauthority.jpg"
        # Perform actions specific to this source
        # newsInfo['additional_field1'] = "Value for source1"
        print(source)
        # Modify other fields in newsInfo as needed
    elif source == "bbc":
        image_url, r_hashtags = image_utils.extract_bbc_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/" + "bbc.jpg"
    elif source == "cbc":
        image_url, r_hashtags = image_utils.extract_cbc_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/" + "cbc.jpg"
    elif source == "francetvinfo":
        image_url, r_hashtags = image_utils.extract_francetvinfo_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/" + "franceinfo.jpg"
    elif source == "theguardian":
        image_url, r_hashtags = image_utils.extract_theguardian_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/" + "theguardian.jpg"
    elif source == "vancouversun":
        image_url, r_hashtags = image_utils.extract_vancouver_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/" + "vancouversun.jpg"
    elif source == "techrepublic":
        newsInfo['image_url'] = image_utils.extract_techrepublic_image_url(page_content)
        logo_path = "/home/franv/mast_bot/logos/" + "techrepublic.jpg"
    elif source == "time":
        image_url, r_hashtags = image_utils.extract_time_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/" + "time.jpg"
    elif source == "wired":
        newsInfo['image_url'] = image_utils.extract_wired_image_url(page_content)
        logo_path = "/home/franv/mast_bot/logos/" + "wired.jpg"
    elif source == "slashdot":
        logo_path = "/home/franv/mast_bot/logos/" + "slashdot.jpg"
    # Not used anymore
    elif source == "ycombinator":
        # The link is in fact in the description
        extract_ycombinator_url(newsInfo)
    elif source == "lithub":
        newsInfo['image_url'] = image_utils.extract_lithub_image_url(page_content)
        logo_path = "/home/franv/mast_bot/logos/" + "lithub.jpg"
        print("Lithub image_url:", newsInfo['image_url'])
    else:
        '''
        # Handle the case when source is not any of the expected values
        # Extract the correct link from the description field
        description = newsInfo.get('description', '')
        start_index = description.find('