From 11c3d1c6b7c718a2c64c795c5326db9966dcf1bd Mon Sep 17 00:00:00 2001
From: fossilfranv
Date: Wed, 20 Sep 2023 16:02:00 -0700
Subject: [PATCH] Delete get_news_entry.py

---
 get_news_entry.py | 418 ----------------------------------------------
 1 file changed, 418 deletions(-)
 delete mode 100644 get_news_entry.py

diff --git a/get_news_entry.py b/get_news_entry.py
deleted file mode 100644
index 9bd5a8f..0000000
--- a/get_news_entry.py
+++ /dev/null
@@ -1,418 +0,0 @@
-import requests
-import image_utils
-import os
-import io
-import logging
-import cv2
-import imageio
-import shutil
-import subprocess
-import time
-import xml.etree.ElementTree as ET
-from urllib.parse import urlparse
-from pathlib import Path
-from bs4 import BeautifulSoup
-from PIL import Image, ImageDraw, ImageFont
-
-
-RSS_URL = 'https://fresh.franv.site/i/?a=rss&user=fossilfranv&token=sdfggf456456465xcvxcvxvc&hours=168'
-MASTODON_TOKEN = 'J65EiYQMpc-hY3CaUJaQPHdXxV7-KiKZjlr0QPESlVQ'
-MASTODON_HOST = 'https://mast.airdog.site'
-
-search_terms = ["slashdot", "time", "bbc", "cbc", "francetvinfo", "lithub", "theguardian",
-                "vancouversun", "techrepublic", "ycombinator", "spiegel",
-                "wired", "androidauthority"]
-
-
-# Define logo_path at module level because it is modified in process_news
-logo_path = ""
-# Define image_path the same way for convenience
-image_path = "/home/franv/mast_bot/images/11.jpg"
-
-
-def post_to_mastodon(source, title, description, link, image_url, hashtags):
-    global logo_path
-    image_path = "/home/franv/mast_bot/images/11.jpg"
-    load_images_path = "/home/franv/mast_bot/images/"
-
-    media_ids = []
-
-    # Write the image to a folder for later retrieval
-    if image_url and is_valid_url(image_url):
-        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
-        img_data = requests.get(image_url, headers=headers, timeout=10).content
-        with open(image_path, 'wb') as handler:
-            handler.write(img_data)
-
-        # Add a bottom band and logo to the image; fall back to the default image on failure
-        new_image = add_bottom_band_with_logo(image_path, 0.15, (220, 220, 220), logo_path)
-        if not new_image:
-            new_image = Image.open("/home/franv/mast_bot/logos/news.jpg")
-
-        new_image.save(image_path)
-    else:
-        # If there is no image, replace 11.jpg with the default image
-        temp_image = Image.open("/home/franv/mast_bot/logos/news.jpg")
-        temp_image.save("/home/franv/mast_bot/images/11.jpg")
-
-    IMG_FILES = [filename for filename in os.listdir(load_images_path) if os.path.isfile(os.path.join(load_images_path, filename))]
-
-    # Originally meant to post several images, but now only one image is posted
-    for file in IMG_FILES:
-        files = {'file': open(os.path.join(load_images_path, file), 'rb')}
-        url = f"{MASTODON_HOST}/api/v1/media"
-        r = requests.post(url, files=files, headers={'Authorization': f'Bearer {MASTODON_TOKEN}'})
-        response_json = r.json()
-        if r.status_code == 200:
-            media_id = response_json['id']
-            media_ids.append(media_id)
-
-    # Compose status_text, which (together with the images) is the only content posted to Mastodon
-    if source and title and description and link:
-        status_text = source.upper() + "\n" + "\n" + title.upper() + "\n" + "\n" + " " + description + "\n" + "\n" + link + "\n" + "\n" + str(hashtags)
-        data = {
-            "status": status_text,
-            "media_ids[]": media_ids,
-            "description": description,
-            "link": link
-        }
-
-        # Post to Mastodon
-        url = f"{MASTODON_HOST}/api/v1/statuses"
-        r = requests.post(url, data=data, headers={'Authorization': f'Bearer {MASTODON_TOKEN}'})
-        json_data = r.json()
-
-    return None
-
-
-def read_news():
-    # Make a request to the RSS feed URL
-    response = requests.get(RSS_URL)
-
-    # Check whether the request was successful (status code 200)
-    if response.status_code == 200:
-        # Parse the XML content using ElementTree
-        root = ET.fromstring(response.content)
-        items = list(root.findall('.//item'))
-        return items
-    else:
-        return None
-
-
-def get_news(items):
-    # Initialize the main counter for the loop
-    main_counter = 0
-    # Iterate over each item element in the XML
-    for i, item in enumerate(items):
-        # Use XPath to extract the desired information from each item
-        title_element = item.find('.//title')
-        title = title_element.text if title_element is not None else None
-
-        description_element = item.find('.//description')
-        description = description_element.text if description_element is not None else None
-
-        # Strip HTML tags from the description and truncate it; guard against a missing description
-        soup = BeautifulSoup(description or "", 'html.parser')
-        description = soup.get_text()[:250]
-
-        link_element = item.find('.//link')
-        link = link_element.text if link_element is not None else None
-
-        enclosure_element = item.find('.//enclosure')
-        enclosure = enclosure_element.get('url') if enclosure_element is not None else None
-
-        media_ids = []
-
-        date_element = item.find('.//pubDate')
-        date = date_element.text if date_element is not None else None
-
-        displaydate_element = item.find('.//displaydate')
-        displaydate = displaydate_element.text if displaydate_element is not None else None
-
-        # Create a newsInfo object with the extracted information
-        newsInfo = {
-            'title': title,
-            'description': description,
-            'link': link,
-            'enclosure': enclosure,
-            'media_ids': media_ids,
-            'date': date,
-            'displaydate': displaydate,
-            'image_url': None,
-            'hashtags': None
-            # Add more fields as needed
-        }
-
-        # Add line feeds to the console output for legibility
-        print("\n" * 2)
-
-        # Extract the source from the newsInfo['link'] URL;
-        # the source is needed in process_news
-        url = newsInfo['link']
-
-        parsed_url = urlparse(url)
-        found_term = None
-        source = None
-
-        # Search for a known term in the URL host
-        term_index = 0
-        while term_index < len(search_terms) and not found_term:
-            term = search_terms[term_index]
-            if term in parsed_url.netloc.lower():
-                found_term = term
-            term_index += 1
-
-        if found_term is not None:
-            source = found_term
-        else:  # Fall back to searching the description
-            description = newsInfo['description'][:50].lower()
-            for term in search_terms:
-                if term in description:
-                    found_term = term
-                    source = found_term
-
-        # Fetch the page content for process_news
-        try:
-            response = requests.get(newsInfo['link'], headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}, timeout=5)
-            print(response)
-
-            if response.status_code == 200:
-                page_content = response.text
-
-                if process_news(page_content, source, newsInfo):
-                    if not newsInfo['image_url']:
-                        newsInfo['image_url'] = None
-                    post_to_mastodon(source, newsInfo['title'], newsInfo['description'], newsInfo['link'], newsInfo['image_url'], newsInfo['hashtags'])
-        except requests.Timeout:
-            continue
-
-        print(newsInfo)
-        # Delay posting so as not to overwhelm Mastodon
-        if main_counter < 6:
-            time.sleep(30)
-            main_counter += 1
-        else:
-            main_counter = 0
-            time.sleep(300)
-
-    # return source, newsInfo
-    # input("Press Enter to continue...")
-
-
-# Process the news according to its source
-def process_news(page_content, source, newsInfo):
-    global logo_path
-    if source == "androidauthority":
-        image_url, r_hastags = image_utils.extract_androidauthority_image_url(page_content)
-        newsInfo['image_url'] = image_url
-        newsInfo['hashtags'] = r_hastags
-        logo_path = "/home/franv/mast_bot/logos/" + "androidauthority.jpg"
-        print(source)
-        # Add or modify other newsInfo fields here as needed
-
-    elif source == "bbc":
-        image_url, r_hastags = image_utils.extract_bbc_image_url(page_content)
-        newsInfo['image_url'] = image_url
-        newsInfo['hashtags'] = r_hastags
-        logo_path = "/home/franv/mast_bot/logos/" + "bbc.jpg"
-
-    elif source == "cbc":
-        image_url, r_hastags = image_utils.extract_cbc_image_url(page_content)
-        newsInfo['image_url'] = image_url
-        newsInfo['hashtags'] = r_hastags
-        logo_path = "/home/franv/mast_bot/logos/" + "cbc.jpg"
-
-    elif source == "francetvinfo":
-        image_url, r_hastags = image_utils.extract_francetvinfo_image_url(page_content)
-        newsInfo['image_url'] = image_url
-        newsInfo['hashtags'] = r_hastags
-        logo_path = "/home/franv/mast_bot/logos/" + "franceinfo.jpg"
-
-    elif source == "theguardian":
-        image_url, r_hastags = image_utils.extract_theguardian_image_url(page_content)
-        newsInfo['image_url'] = image_url
-        newsInfo['hashtags'] = r_hastags
-        logo_path = "/home/franv/mast_bot/logos/" + "theguardian.jpg"
-
-    elif source == "vancouversun":
-        image_url, r_hastags = image_utils.extract_vancouver_image_url(page_content)
-        newsInfo['image_url'] = image_url
-        newsInfo['hashtags'] = r_hastags
-        logo_path = "/home/franv/mast_bot/logos/" + "vancouversun.jpg"
-
-    elif source == "techrepublic":
-        newsInfo['image_url'] = image_utils.extract_techrepublic_image_url(page_content)
-        logo_path = "/home/franv/mast_bot/logos/" + "techrepublic.jpg"
-
-    elif source == "time":
-        image_url, r_hastags = image_utils.extract_time_image_url(page_content)
-        newsInfo['image_url'] = image_url
-        newsInfo['hashtags'] = r_hastags
-        logo_path = "/home/franv/mast_bot/logos/" + "time.jpg"
-
-    elif source == "wired":
-        newsInfo['image_url'] = image_utils.extract_wired_image_url(page_content)
-        logo_path = "/home/franv/mast_bot/logos/" + "wired.jpg"
-
-    elif source == "slashdot":
-        logo_path = "/home/franv/mast_bot/logos/" + "slashdot.jpg"
-
-    # Not used anymore
-    elif source == "ycombinator":
-        # The link is in fact in the description
-        extract_ycombinator_url(newsInfo)
-
-    elif source == "lithub":
-        # The link is in fact in the description
-        newsInfo['image_url'] = image_utils.extract_lithub_image_url(page_content)
-        logo_path = "/home/franv/mast_bot/logos/" + "lithub.jpg"
-        print("Lithub image_url:", newsInfo['image_url'])
-
-    else:
-        ''' # Handle the case when source is not any of the expected values
-        # Extract the correct link from the description field
-        description = newsInfo.get('description', '')
-        start_index = description.find('
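
The patch above is truncated mid-file, but the core of the deleted post_to_mastodon is a standard two-step Mastodon flow: upload media to /api/v1/media, then attach the returned ids to /api/v1/statuses. A minimal standalone sketch of that flow follows; the host and token values are placeholders (not the live credentials from the script), and post_status_with_image is an illustrative name:

import requests

MASTODON_HOST = "https://mastodon.example"  # placeholder instance
MASTODON_TOKEN = "REDACTED"                 # placeholder token
AUTH = {"Authorization": f"Bearer {MASTODON_TOKEN}"}

def post_status_with_image(status_text, image_path):
    # Step 1: upload the image; Mastodon returns a media id
    with open(image_path, "rb") as f:
        r = requests.post(f"{MASTODON_HOST}/api/v1/media",
                          files={"file": f}, headers=AUTH, timeout=10)
    r.raise_for_status()
    media_id = r.json()["id"]

    # Step 2: create the status, attaching the uploaded media
    r = requests.post(f"{MASTODON_HOST}/api/v1/statuses",
                      data={"status": status_text, "media_ids[]": [media_id]},
                      headers=AUTH, timeout=10)
    r.raise_for_status()
    return r.json()["url"]

Unlike the deleted code, this sketch raises on HTTP errors instead of silently skipping failed uploads, which makes failures visible during testing.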
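Likewise, the read_news/get_news pair reduces to fetching the feed and walking its <item> elements with ElementTree. A minimal sketch assuming the same tag layout the script expected (fetch_items and item_fields are illustrative names, not part of the original code):

import requests
import xml.etree.ElementTree as ET

def fetch_items(rss_url):
    # Fetch the feed and return its <item> elements, or [] on failure
    response = requests.get(rss_url, timeout=10)
    if response.status_code != 200:
        return []
    root = ET.fromstring(response.content)
    return root.findall(".//item")

def item_fields(item):
    # Pull the child elements the bot used, tolerating missing tags
    def text(tag):
        el = item.find(f".//{tag}")
        return el.text if el is not None else None
    enclosure = item.find(".//enclosure")
    return {
        "title": text("title"),
        "description": text("description"),
        "link": text("link"),
        "enclosure": enclosure.get("url") if enclosure is not None else None,
        "date": text("pubDate"),
    }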
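Finally, the source-detection step in get_news is a substring match of known outlet names against the link's host, falling back to the start of the description. A condensed sketch of that logic (detect_source is an illustrative name and the term list is abridged):

from urllib.parse import urlparse

SEARCH_TERMS = ["bbc", "cbc", "theguardian", "wired"]  # abridged list

def detect_source(link, description=""):
    # Prefer a match in the URL host; fall back to the description text
    host = urlparse(link).netloc.lower()
    for term in SEARCH_TERMS:
        if term in host:
            return term
    snippet = (description or "")[:50].lower()
    for term in SEARCH_TERMS:
        if term in snippet:
            return term
    return None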