diff --git a/get_news_entry.py b/get_news_entry.py
new file mode 100644
index 0000000..9bd5a8f
--- /dev/null
+++ b/get_news_entry.py
@@ -0,0 +1,418 @@
import requests
import image_utils
import os
import io
import logging
import cv2
import imageio
import shutil
import subprocess
import time
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from pathlib import Path
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont


RSS_URL = 'https://fresh.franv.site/i/?a=rss&user=fossilfranv&token=sdfggf456456465xcvxcvxvc&hours=168'
MASTODON_TOKEN = 'J65EiYQMpc-hY3CaUJaQPHdXxV7-KiKZjlr0QPESlVQ'
MASTODON_HOST = 'https://mast.airdog.site'

search_terms = ["slashdot", "time", "bbc", "cbc", "francetvinfo", "lithub",
                "theguardian", "vancouversun", "techrepublic", "ycombinator",
                "spiegel", "wired", "androidauthority"]


# logo_path is a module-level global because process_news() sets it and
# post_to_mastodon() reads it.
logo_path = ""
# Default path that downloaded article images are written to.
image_path = "/home/franv/mast_bot/images/11.jpg"


def post_to_mastodon(source, title, description, link, image_url, hashtags):
    global logo_path
    image_path = "/home/franv/mast_bot/images/11.jpg"
    load_images_path = "/home/franv/mast_bot/images/"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

    media_ids = []

    # Download the article image to disk for later retrieval
    if image_url and is_valid_url(image_url):
        img_data = requests.get(image_url, headers=headers, timeout=10).content
        with open(image_path, 'wb') as handler:
            handler.write(img_data)

        # Add the bottom band and source logo to the image
        new_image = add_bottom_band_with_logo(image_path, 0.15, (220, 220, 220), logo_path)
        if not new_image:
            new_image = Image.open("/home/franv/mast_bot/logos/news.jpg")

        new_image.save(image_path)
    else:
        # No usable image: fall back to the default news placeholder
        temp_image = Image.open("/home/franv/mast_bot/logos/news.jpg")
        temp_image.save(image_path)

    IMG_FILES = [filename for filename in os.listdir(load_images_path)
                 if os.path.isfile(os.path.join(load_images_path, filename))]

    # Originally meant to post several images; now only one image is posted
    for file in IMG_FILES:
        url = f"{MASTODON_HOST}/api/v1/media"
        with open(os.path.join(load_images_path, file), 'rb') as f:
            r = requests.post(url, files={'file': f},
                              headers={'Authorization': f'Bearer {MASTODON_TOKEN}'})
        if r.status_code == 200:
            media_ids.append(r.json()['id'])

    # Compose status_text, which (plus the images) is the whole Mastodon post
    if source and title and description and link:
        status_text = source.upper() + "\n\n" + title.upper() + "\n\n" + " " + description + "\n\n" + link + "\n\n" + str(hashtags)
        data = {
            "status": status_text,
            "media_ids[]": media_ids
        }

        # Post the status
        url = f"{MASTODON_HOST}/api/v1/statuses"
        r = requests.post(url, data=data, headers={'Authorization': f'Bearer {MASTODON_TOKEN}'})

    return None
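post_to_mastodon() relies on two helpers, is_valid_url() and add_bottom_band_with_logo(), that are defined further down in the 418-line file, outside the excerpt shown here. As a point of reference only, a minimal sketch of the URL check (assuming it merely needs to reject malformed or non-HTTP URLs) could look like this:

def is_valid_url(url):
    # Hypothetical sketch, not the file's actual implementation:
    # accept only absolute http(s) URLs with a host component.
    try:
        parsed = urlparse(url)
        return parsed.scheme in ("http", "https") and bool(parsed.netloc)
    except ValueError:
        return False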
def read_news():
    # Request the RSS feed
    response = requests.get(RSS_URL)

    # Check that the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the XML content using ElementTree
        root = ET.fromstring(response.content)
        items = list(root.findall('.//item'))
        return items
    else:
        return None


def get_news(items):
    # Counter used to pace the posting loop
    main_counter = 0

    # Iterate over each item element in the XML
    for i, item in enumerate(items):
        # Extract the desired fields from each item
        title_element = item.find('.//title')
        title = title_element.text if title_element is not None else None

        description_element = item.find('.//description')
        description = description_element.text if description_element is not None else None

        # Strip HTML tags from the description and truncate it
        if description:
            soup = BeautifulSoup(description, 'html.parser')
            description = soup.get_text()[:250]

        link_element = item.find('.//link')
        link = link_element.text if link_element is not None else None

        enclosure_element = item.find('.//enclosure')
        enclosure = enclosure_element.get('url') if enclosure_element is not None else None

        media_ids = []

        date_element = item.find('.//pubDate')
        date = date_element.text if date_element is not None else None

        displaydate_element = item.find('.//displaydate')
        displaydate = displaydate_element.text if displaydate_element is not None else None

        # Collect the extracted information in a newsInfo dict
        newsInfo = {
            'title': title,
            'description': description,
            'link': link,
            'enclosure': enclosure,
            'media_ids': media_ids,
            'date': date,
            'displaydate': displaydate,
            'image_url': None,
            'hashtags': None
            # Add more fields as needed
        }

        # Blank lines between items to keep the console output legible
        print("\n" * 2)

        # Identify the source from the link URL; process_news() needs it
        url = newsInfo['link']
        parsed_url = urlparse(url)
        found_term = None
        source = None

        # Look for a known source in the URL's host name
        term_index = 0
        while term_index < len(search_terms) and not found_term:
            term = search_terms[term_index]
            if term in parsed_url.netloc.lower():
                found_term = term
            term_index += 1

        if found_term is not None:
            source = found_term
        else:
            # Fall back to searching the start of the description
            desc_snippet = (newsInfo['description'] or '')[:50].lower()
            for term in search_terms:
                if term in desc_snippet:
                    found_term = term
                    source = found_term
                    break

        # Fetch the article page for process_news()
        try:
            response = requests.get(newsInfo['link'],
                                    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"},
                                    timeout=5)
            print(response)

            if response.status_code == 200:
                page_content = response.text

                if process_news(page_content, source, newsInfo):
                    post_to_mastodon(source, newsInfo['title'], newsInfo['description'],
                                     newsInfo['link'], newsInfo['image_url'], newsInfo['hashtags'])
        except requests.RequestException:
            # Skip items whose pages time out or otherwise fail to load
            continue

        print(newsInfo)
        # Pause between posts so as not to overwhelm Mastodon:
        # 30 s between posts, with a 5-minute break after every seventh
        if main_counter < 6:
            time.sleep(30)
            main_counter += 1
        else:
            main_counter = 0
            time.sleep(300)
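The excerpt does not include the script's entry point. Presumably the feed is read once and every item is handed to get_news(); a hedged sketch of what that driver might look like:

if __name__ == "__main__":
    # Hypothetical driver, not shown in this diff: fetch the feed items
    # and process/post each one in turn.
    items = read_news()
    if items:
        get_news(items)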
# Process the news item according to its source
def process_news(page_content, source, newsInfo):
    global logo_path
    if source == "androidauthority":
        image_url, r_hashtags = image_utils.extract_androidauthority_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/androidauthority.jpg"
        print(source)

    elif source == "bbc":
        image_url, r_hashtags = image_utils.extract_bbc_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/bbc.jpg"

    elif source == "cbc":
        image_url, r_hashtags = image_utils.extract_cbc_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/cbc.jpg"

    elif source == "francetvinfo":
        image_url, r_hashtags = image_utils.extract_francetvinfo_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/franceinfo.jpg"

    elif source == "theguardian":
        image_url, r_hashtags = image_utils.extract_theguardian_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/theguardian.jpg"

    elif source == "vancouversun":
        image_url, r_hashtags = image_utils.extract_vancouver_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/vancouversun.jpg"

    elif source == "techrepublic":
        # This extractor returns only the image URL, no hashtags
        newsInfo['image_url'] = image_utils.extract_techrepublic_image_url(page_content)
        logo_path = "/home/franv/mast_bot/logos/techrepublic.jpg"

    elif source == "time":
        image_url, r_hashtags = image_utils.extract_time_image_url(page_content)
        newsInfo['image_url'] = image_url
        newsInfo['hashtags'] = r_hashtags
        logo_path = "/home/franv/mast_bot/logos/time.jpg"

    elif source == "wired":
        # This extractor returns only the image URL, no hashtags
        newsInfo['image_url'] = image_utils.extract_wired_image_url(page_content)
        logo_path = "/home/franv/mast_bot/logos/wired.jpg"

    elif source == "slashdot":
        logo_path = "/home/franv/mast_bot/logos/slashdot.jpg"

    # Not used anymore
    elif source == "ycombinator":
        # The link is actually in the description
        extract_ycombinator_url(newsInfo)

    elif source == "lithub":
        newsInfo['image_url'] = image_utils.extract_lithub_image_url(page_content)
        logo_path = "/home/franv/mast_bot/logos/lithub.jpg"
        print("Lithub image_url:", newsInfo['image_url'])

    else:
        ''' # Handle the case when source is not any of the expected values
        # Extract the correct link from the description field
        description = newsInfo.get('description', '')
        start_index = description.find('
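The diff is cut off in the middle of this disabled fallback branch, so the actual logic is not visible. Judging from its comments, it digs the real article link out of the HTML description; a hedged guess at the idea, reusing the BeautifulSoup import already at the top of the file, might be:

# Hypothetical reconstruction of the fallback, not the author's actual code:
# take the first anchor's href from the HTML description and use it as the link.
soup = BeautifulSoup(newsInfo.get('description', ''), 'html.parser')
anchor = soup.find('a')
if anchor and anchor.get('href'):
    newsInfo['link'] = anchor['href']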