import asyncio import aiohttp from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning import copy import datetime import discord import feedparser import filetype import io import itertools import logging import re import time import warnings from typing import Optional, Union from types import MappingProxyType, SimpleNamespace from urllib.parse import urlparse from redbot.core import checks, commands, Config from redbot.core.utils import can_user_send_messages_in from redbot.core.utils.chat_formatting import bold, box, escape, humanize_list, pagify from .color import Color from .quiet_template import QuietTemplate from .rss_feed import RssFeed from .tag_type import INTERNAL_TAGS, VALID_IMAGES, TagType log = logging.getLogger("red.aikaterna.rss") IPV4_RE = re.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}") IPV6_RE = re.compile("([a-f0-9:]+:+)+[a-f0-9]+") GuildMessageable = Union[discord.TextChannel, discord.VoiceChannel, discord.StageChannel, discord.Thread] __version__ = "2.1.8" warnings.filterwarnings( "ignore", category=DeprecationWarning, # Ignore the warning in feedparser module *and* our module to account for the unreleased fix of this warning: # https://github.com/kurtmckee/feedparser/pull/278 module=r"^(feedparser|rss)(\..+)?$", message=( "To avoid breaking existing software while fixing issue 310, a temporary mapping has been created from" " `updated_parsed` to `published_parsed` if `updated_parsed` doesn't exist" ), ) warnings.filterwarnings("ignore", module="rss", category=MarkupResemblesLocatorWarning) class RSS(commands.Cog): """RSS feeds for your server.""" def __init__(self, bot): self.bot = bot self.config = Config.get_conf(self, 2761331001, force_registration=True) self.config.register_channel(feeds={}) self.config.register_global(use_published=["www.youtube.com"]) self._post_queue = asyncio.PriorityQueue() self._post_queue_size = None self._read_feeds_loop = None self._headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; 
rv:121.0) Gecko/20100101 Firefox/121.0"} async def red_delete_data_for_user(self, **kwargs): """Nothing to delete""" return def initialize(self): self._read_feeds_loop = self.bot.loop.create_task(self.read_feeds()) def cog_unload(self): if self._read_feeds_loop: self._read_feeds_loop.cancel() def _add_content_images(self, bs4_soup: BeautifulSoup, rss_object: feedparser.util.FeedParserDict): """ $content_images should always be marked as a special tag as the tags will be dynamically generated based on the content included in the latest post. """ content_images = bs4_soup.find_all("img") if content_images: for i, image in enumerate(content_images): tag_name = f"content_image{str(i + 1).zfill(2)}" try: rss_object[tag_name] = image["src"] rss_object["is_special"].append(tag_name) except KeyError: pass return rss_object async def _add_feed(self, ctx, feed_name: str, channel: GuildMessageable, url: str): """Helper for rss add.""" rss_exists = await self._check_feed_existing(ctx, feed_name, channel) if not rss_exists: feedparser_obj = await self._fetch_feedparser_object(url) if not feedparser_obj: await ctx.send("Couldn't fetch that feed: there were no feed objects found.") return # sort everything by time if a time value is present if feedparser_obj.entries: # this feed has posts sorted_feed_by_post_time = await self._sort_by_post_time(feedparser_obj.entries) else: # this feed does not have posts, but it has a header with channel information sorted_feed_by_post_time = [feedparser_obj.feed] # add additional tags/images/clean html feedparser_plus_obj = await self._add_to_feedparser_object(sorted_feed_by_post_time[0], url) rss_object = await self._convert_feedparser_to_rssfeed(feed_name, feedparser_plus_obj, url) async with self.config.channel(channel).feeds() as feed_data: feed_data[feed_name] = rss_object.to_json() msg = ( f"Feed `{feed_name}` added in channel: {channel.mention}\n" f"List the template tags with `{ctx.prefix}rss listtags` " f"and modify the template using 
`{ctx.prefix}rss template`." ) await ctx.send(msg) else: await ctx.send(f"There is already an existing feed named {bold(feed_name)} in {channel.mention}.") return def _add_generic_html_plaintext(self, bs4_soup: BeautifulSoup): """ Bs4's .text attribute on a soup strips newlines and spaces This provides newlines and more readable content. """ text = "" for element in bs4_soup.descendants: if isinstance(element, str): text += element elif element.name == "br" or element.name == "p" or element.name == "li": text += "\n" text = re.sub("\\n+", "\n", text) text = text.replace("*", "\\*") text = text.replace("SC_OFF", "").replace("SC_ON", "\n") text = text.replace("[link]", "").replace("[comments]", "") return escape(text) async def _append_bs4_tags(self, rss_object: feedparser.util.FeedParserDict, url: str): """Append bs4-discovered tags to an rss_feed/feedparser object.""" rss_object["is_special"] = [] soup = None tags_list = [] temp_rss_obect = copy.deepcopy(rss_object) for tag_name, tag_content in temp_rss_obect.items(): if tag_name in INTERNAL_TAGS: continue tag_content_check = await self._get_tag_content_type(tag_content) if tag_content_check == TagType.HTML: # this is a tag that is only html content try: soup = BeautifulSoup(tag_content, "html.parser") except TypeError: pass # this is a standard html format summary_detail tag # the tag was determined to be html through the type attrib that # was attached from the feed publisher but it's really a dict. 
try: soup = BeautifulSoup(tag_content["value"], "html.parser") except (KeyError, TypeError): pass # this is a standard html format content or summary tag try: soup = BeautifulSoup(tag_content[0]["value"], "html.parser") except (KeyError, TypeError): pass if soup: rss_object[f"{tag_name}_plaintext"] = self._add_generic_html_plaintext(soup) if tag_content_check == TagType.LIST: tags_content_counter = 0 for list_item in tag_content: list_item_check = await self._get_tag_content_type(list_item) # for common "links" format or when "content" is a list list_html_content_counter = 0 if list_item_check == TagType.HTML: list_tags = ["value", "href"] for tag in list_tags: try: url_check = await self._valid_url(list_item[tag], feed_check=False) if not url_check: # bs4 will cry if you try to give it a url to parse, so let's only # parse non-url content tag_content = BeautifulSoup(list_item[tag], "html.parser") tag_content = self._add_generic_html_plaintext(tag_content) else: tag_content = list_item[tag] list_html_content_counter += 1 name = f"{tag_name}_plaintext{str(list_html_content_counter).zfill(2)}" rss_object[name] = tag_content rss_object["is_special"].append(name) except (KeyError, TypeError): pass if list_item_check == TagType.DICT: authors_content_counter = 0 enclosure_content_counter = 0 enclosure_url_counter = 0 # common "authors" tag format try: authors_content_counter += 1 name = f"{tag_name}_plaintext{str(authors_content_counter).zfill(2)}" tag_content = BeautifulSoup(list_item["name"], "html.parser") rss_object[name] = tag_content.get_text() rss_object["is_special"].append(name) except KeyError: pass # common "enclosure" tag image format # note: this is not adhering to RSS feed specifications # proper enclosure tags should have `length`, `type`, `url` # and not `href`, `type`, `rel` # but, this is written for the first feed I have seen with an "enclosure" tag try: image_url = list_item["href"] image_type = list_item["type"] image_rel = list_item["rel"] 
enclosure_content_counter += 1 name = f"media_plaintext{str(enclosure_content_counter).zfill(2)}" rss_object[name] = image_url rss_object["is_special"].append(name) except KeyError: pass # special tag for enclosure["url"] so that users can differentiate them # from image urls found in enclosure["href"] try: image_url = list_item["url"] enclosure_url_counter += 1 name = f"media_url{str(enclosure_url_counter).zfill(2)}" rss_object[name] = image_url rss_object["is_special"].append(name) except KeyError: pass # common "tags" tag format try: tag = list_item["term"] tags_content_counter += 1 name = f"{tag_name}_plaintext{str(tags_content_counter).zfill(2)}" rss_object[name] = tag rss_object["is_special"].append(name) tags_list.append(tag) if tag not in tags_list else tags_list except KeyError: pass if len(tags_list) > 0: rss_object["tags_list"] = tags_list rss_object["tags_plaintext_list"] = humanize_list(tags_list) rss_object["is_special"].append("tags_list") rss_object["is_special"].append("tags_plaintext_list") # if image dict tag exists, check for an image try: rss_object["image_plaintext"] = rss_object["image"]["href"] rss_object["is_special"].append("image_plaintext") except KeyError: pass # if media_thumbnail or media_content exists, return the first friendly url try: rss_object["media_content_plaintext"] = rss_object["media_content"][0]["url"] rss_object["is_special"].append("media_content_plaintext") except KeyError: pass try: rss_object["media_thumbnail_plaintext"] = rss_object["media_thumbnail"][0]["url"] rss_object["is_special"].append("media_thumbnail_plaintext") except KeyError: pass # change published_parsed and updated_parsed into a datetime object for embed footers for time_tag in ["updated_parsed", "published_parsed"]: try: if isinstance(rss_object[time_tag], time.struct_time): rss_object[f"{time_tag}_datetime"] = datetime.datetime(*rss_object[time_tag][:6]) except KeyError: pass if soup: rss_object = self._add_content_images(soup, rss_object) # add 
special tag/special site formatter here if needed in the future return rss_object async def _check_channel_permissions(self, ctx, channel: GuildMessageable, addl_send_messages_check=True): """Helper for rss functions.""" if not channel.permissions_for(ctx.me).read_messages: await ctx.send("I don't have permissions to read that channel.") return False author_perms = channel.permissions_for(ctx.author) if not author_perms.read_messages: await ctx.send("You don't have permissions to read that channel.") return False # bot can only see threads that it has permissions to read messages in so no special handling needed # if author has read messages perm, they can read all public threads *but also* private threads they are in if isinstance(channel, discord.Thread) and channel.is_private() and not author_perms.manage_threads: try: await channel.fetch_member(ctx.author.id) except discord.NotFound: # author is not in a private thread return False if addl_send_messages_check: # check for send messages perm if needed, like on an rss add # not needed on something like rss delete if not can_user_send_messages_in(ctx.me, channel): await ctx.send("I don't have permissions to send messages in that channel.") return False else: return True else: return True async def _check_feed_existing(self, ctx, feed_name: str, channel: GuildMessageable): """Helper for rss functions.""" rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: return False return True async def _delete_feed(self, ctx, feed_name: str, channel: GuildMessageable): """Helper for rss delete.""" rss_exists = await self._check_feed_existing(ctx, feed_name, channel) if rss_exists: async with self.config.channel(channel).feeds() as rss_data: rss_data.pop(feed_name, None) return True return False async def _edit_template(self, ctx, feed_name: str, channel: GuildMessageable, template: str): """Helper for rss template.""" rss_exists = await self._check_feed_existing(ctx, feed_name, 
channel) if rss_exists: async with self.config.channel(channel).feeds.all() as feed_data: if feed_name not in feed_data: feed_data[feed_name] = {} feed_data[feed_name]["template"] = template return True return False @staticmethod def _find_website(website_url: str): """Helper for rss parse.""" result = urlparse(website_url) if result.scheme: # https://www.website.com/... if result.netloc: website = result.netloc else: return None else: # www.website.com/... if result.path: website = result.path.split("/")[0] else: return None return website async def _get_channel_object(self, channel_id: int): """Helper for rss feed loop.""" channel = self.bot.get_channel(channel_id) if not channel: try: channel = await self.bot.fetch_channel(channel_id) except (discord.errors.Forbidden, discord.errors.NotFound): return None if channel and can_user_send_messages_in(channel.guild.me, channel): return channel return None async def _get_feed_names(self, channel: GuildMessageable): """Helper for rss list/listall.""" feed_list = [] space = "\N{SPACE}" all_feeds = await self.config.channel(channel).feeds.all() if not all_feeds: return ["None."] longest_name_len = len(max(list(all_feeds.keys()), key=len)) for name, data in all_feeds.items(): extra_spacing = longest_name_len - len(name) feed_list.append(f"{name}{space * extra_spacing} {data['url']}") return feed_list async def _get_tag_content_type(self, tag_content): """ Tag content type can be: str, list, dict (FeedParserDict), bool, datetime.datetime object or time.struct_time """ try: if tag_content["type"] == "text/html": return TagType(2) except (KeyError, TypeError): html_tags = ["", "", "", "", ""] if any(word in str(tag_content) for word in html_tags): return TagType(2) if isinstance(tag_content, dict): return TagType(3) elif isinstance(tag_content, list): return TagType(4) else: return TagType(1) async def _get_url_content(self, url): """Helper for rss add/_valid_url.""" try: # force github.com to serve us xml instead of json 
headers = self._headers if "github.com" in url: headers["Accept"] = "application/vnd.github+xml" timeout = aiohttp.ClientTimeout(total=20) async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session: async with session.get(url) as resp: if resp.status == 404: friendly_msg = "The server returned 404 Not Found. Check your url and try again." return None, friendly_msg html = await resp.read() return html, None except aiohttp.client_exceptions.ClientConnectorError: friendly_msg = "There was an OSError or the connection failed." msg = f"aiohttp failure accessing feed at url:\n\t{url}" log.error(msg, exc_info=True) return None, friendly_msg except aiohttp.client_exceptions.ClientPayloadError as e: friendly_msg = "The website closed the connection prematurely or the response was malformed.\n" friendly_msg += f"The error returned was: `{str(e)}`\n" friendly_msg += "For more technical information, check your bot's console or logs." msg = f"content error while reading feed at url:\n\t{url}" log.error(msg, exc_info=True) return None, friendly_msg except asyncio.exceptions.TimeoutError: friendly_msg = "The bot timed out while trying to access that content." msg = f"asyncio timeout while accessing feed at url:\n\t{url}" log.error(msg, exc_info=True) return None, friendly_msg except aiohttp.client_exceptions.ServerDisconnectedError: friendly_msg = "The target server disconnected early without a response." msg = f"server disconnected while accessing feed at url:\n\t{url}" log.error(msg, exc_info=True) return None, friendly_msg except Exception: friendly_msg = "There was an unexpected error. Check your console for more information." 
msg = f"General failure accessing feed at url:\n\t{url}" log.error(msg, exc_info=True) return None, friendly_msg async def _fetch_feedparser_object(self, url: str): """Get a full feedparser object from a url: channel header + items.""" html, error_msg = await self._get_url_content(url) if not html: return SimpleNamespace(entries=None, error=error_msg, url=url) feedparser_obj = feedparser.parse(html) if feedparser_obj.bozo: error_msg = f"Bozo feed: feedparser is unable to parse the response from {url}.\n" error_msg += f"Feedparser error message: `{feedparser_obj.bozo_exception}`" return SimpleNamespace(entries=None, error=error_msg, url=url) return feedparser_obj async def _add_to_feedparser_object(self, feedparser_obj: feedparser.util.FeedParserDict, url: str): """ Input: A feedparser object Process: Append custom tags to the object from the custom formatters Output: A feedparser object with additional attributes """ feedparser_plus_obj = await self._append_bs4_tags(feedparser_obj, url) feedparser_plus_obj["template_tags"] = sorted(feedparser_plus_obj.keys()) return feedparser_plus_obj async def _convert_feedparser_to_rssfeed( self, feed_name: str, feedparser_plus_obj: feedparser.util.FeedParserDict, url: str ): """ Converts any feedparser/feedparser_plus object to an RssFeed object. Used in rss add when saving a new feed. 
""" entry_time = await self._time_tag_validation(feedparser_plus_obj) # sometimes there's no title or no link attribute and feedparser doesn't really play nice with that try: feedparser_plus_obj_title = feedparser_plus_obj["title"] except KeyError: feedparser_plus_obj_title = "" try: feedparser_plus_obj_link = feedparser_plus_obj["link"] except KeyError: feedparser_plus_obj_link = "" rss_object = RssFeed( name=feed_name.lower(), last_title=feedparser_plus_obj_title, last_link=feedparser_plus_obj_link, last_time=entry_time, template="$title\n$link", url=url, template_tags=feedparser_plus_obj["template_tags"], is_special=feedparser_plus_obj["is_special"], embed=True, ) return rss_object async def _sort_by_post_time(self, feedparser_obj: feedparser.util.FeedParserDict): base_url = urlparse(feedparser_obj[0].get("link")).netloc use_published_parsed_override = await self.config.use_published() if base_url in use_published_parsed_override: time_tag = ["published_parsed"] else: time_tag = ["updated_parsed", "published_parsed"] for tag in time_tag: try: baseline_time = time.struct_time((2021, 1, 1, 12, 0, 0, 4, 1, -1)) sorted_feed_by_post_time = sorted(feedparser_obj, key=lambda x: x.get(tag, baseline_time), reverse=True) break except TypeError: sorted_feed_by_post_time = feedparser_obj return sorted_feed_by_post_time async def _time_tag_validation(self, entry: feedparser.util.FeedParserDict): """Gets a unix timestamp if it's available from a single feedparser post entry.""" feed_link = entry.get("link", None) if feed_link: base_url = urlparse(feed_link).netloc else: return None # check for a feed time override, if a feed is being problematic regarding updated_parsed # usage (i.e. 
a feed entry keeps reposting with no perceived change in content) use_published_parsed_override = await self.config.use_published() if base_url in use_published_parsed_override: entry_time = entry.get("published_parsed", None) else: entry_time = entry.get("updated_parsed", None) if not entry_time: entry_time = entry.get("published_parsed", None) if isinstance(entry_time, time.struct_time): entry_time = time.mktime(entry_time) if entry_time: return int(entry_time) return None @staticmethod async def _title_case(phrase: str): exceptions = ["a", "and", "in", "of", "or", "on", "the"] lowercase_words = re.split(" ", phrase.lower()) final_words = [lowercase_words[0].capitalize()] final_words += [word if word in exceptions else word.capitalize() for word in lowercase_words[1:]] return " ".join(final_words) async def _update_last_scraped( self, channel: GuildMessageable, feed_name: str, current_feed_title: str, current_feed_link: str, current_feed_time: int, ): """Updates last title and last link seen for comparison on next feed pull.""" async with self.config.channel(channel).feeds() as feed_data: try: feed_data[feed_name]["last_title"] = current_feed_title feed_data[feed_name]["last_link"] = current_feed_link feed_data[feed_name]["last_time"] = current_feed_time except KeyError: # the feed was deleted during a _get_current_feed execution pass async def _valid_url(self, url: str, feed_check=True): """Helper for rss add.""" try: result = urlparse(url) except Exception as e: log.exception(e, exc_info=e) return False if all([result.scheme, result.netloc, result.path]): if feed_check: text, error_msg = await self._get_url_content(url) if not text: raise NoFeedContent(error_msg) return False rss = feedparser.parse(text) if rss.bozo: error_message = rss.feed.get("summary", str(rss))[:1500] error_message = re.sub(IPV4_RE, "[REDACTED IP ADDRESS]", error_message) error_message = re.sub(IPV6_RE, "[REDACTED IP ADDRESS]", error_message) msg = f"Bozo feed: feedparser is unable to 
parse the response from {url}.\n\n" msg += "Received content preview:\n" msg += box(error_message) raise NoFeedContent(msg) return False else: return True else: return True else: return False async def _validate_image(self, url: str): """Helper for _get_current_feed_embed.""" try: timeout = aiohttp.ClientTimeout(total=20) async with aiohttp.ClientSession(headers=self._headers, timeout=timeout) as session: async with session.get(url) as resp: image = await resp.content.read(261) img = io.BytesIO(image) file_type = filetype.guess(img) if not file_type: return None return file_type.extension except aiohttp.client_exceptions.InvalidURL: return None except asyncio.exceptions.TimeoutError: log.error(f"asyncio timeout while accessing image at url:\n\t{url}", exc_info=True) return None except Exception: log.error(f"Failure accessing image in embed feed at url:\n\t{url}", exc_info=True) return None @commands.guild_only() @commands.group() @checks.mod_or_permissions(manage_channels=True) async def rss(self, ctx): """RSS feed stuff.""" pass @rss.command(name="add") async def _rss_add(self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None, *, url: str): """ Add an RSS feed to a channel. Defaults to the current channel if no channel is specified. 
""" if feed_name.startswith("<#"): # someone typed a channel name but not a feed name msg = "Try again with a feed name included in the right spot so that you can refer to the feed later.\n" msg += f"Example: `{ctx.prefix}rss add feed_name channel_name feed_url`" await ctx.send(msg) return channel = channel or ctx.channel channel_permission_check = await self._check_channel_permissions(ctx, channel) if not channel_permission_check: return async with ctx.typing(): try: valid_url = await self._valid_url(url) except NoFeedContent as e: await ctx.send(str(e)) return if valid_url: await self._add_feed(ctx, feed_name.lower(), channel, url) else: await ctx.send("Invalid or unavailable URL.") @rss.group(name="embed") async def _rss_embed(self, ctx): """Embed feed settings.""" pass @_rss_embed.command(name="color", aliases=["colour"]) async def _rss_embed_color( self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None, *, color: str = None ): """ Set an embed color for a feed. Use this command with no color to reset to the default. `color` must be a hex code like #990000, a [Discord color name](https://discordpy.readthedocs.io/en/latest/api.html#colour), or a [CSS3 color name](https://www.w3.org/TR/2018/REC-css-color-3-20180619/#svg-color). """ channel = channel or ctx.channel rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("That feed name doesn't exist in this channel.") return embed_toggle = rss_feed["embed"] embed_state_message = "" if not embed_toggle: embed_state_message += ( f"{bold(feed_name)} is not currently set to be in an embed. " f"Toggle it on with `{ctx.prefix}rss embed toggle`.\n" ) if not color: async with self.config.channel(channel).feeds() as feed_data: feed_data[feed_name]["embed_color"] = None await ctx.send( f"{embed_state_message}The color for {bold(feed_name)} has been reset. " "Use this command with a color argument to set a color for this feed." 
) return color = color.replace(" ", "_") hex_code = await Color()._color_converter(color) if not hex_code: await ctx.send( "Not a valid color code. Use a hex code like #990000, a " "Discord color name or a CSS3 color name.\n" "\n" "" ) return user_facing_hex = hex_code.replace("0x", "#") color_name = await Color()._hex_to_css3_name(hex_code) # 0xFFFFFF actually doesn't show up as white in an embed # so let's make it close enough to count if hex_code == "0xFFFFFF": hex_code = "0xFFFFFE" async with self.config.channel(channel).feeds() as feed_data: # data is always a 0xFFFFFF style value feed_data[feed_name]["embed_color"] = hex_code await ctx.send(f"Embed color for {bold(feed_name)} set to {user_facing_hex} ({color_name}).") @_rss_embed.command(name="image") async def _rss_embed_image( self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None, image_tag_name: str = None ): """ Set a tag to be a large embed image. This image will be applied to the last embed in the paginated list. Use this command with no image_tag_name to clear the embed image. """ channel = channel or ctx.channel rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("That feed name doesn't exist in this channel.") return embed_toggle = rss_feed["embed"] embed_state_message = "" if not embed_toggle: embed_state_message += ( f"{bold(feed_name)} is not currently set to be in an embed. " f"Toggle it on with `{ctx.prefix}rss embed toggle`.\n" ) if image_tag_name is not None: if image_tag_name.startswith("$"): image_tag_name = image_tag_name.strip("$") else: msg = "You must use a feed tag for this setting. 
" msg += f"Feed tags start with `$` and can be found by using `{ctx.prefix}rss listtags` " msg += "with the saved feed name.\nImages that are scraped from feed content are usually " msg += "stored under the tags styled similar to `$content_image01`: subsequent scraped images " msg += "will be in tags named `$content_image02`, `$content_image03`, etc. Not every feed entry " msg += "will have the same amount of scraped image tags. Images can also be found under tags named " msg += "`$media_content_plaintext`, if present.\nExperiment with tags by setting them as your " msg += ( f"template with `{ctx.prefix}rss template` and using `{ctx.prefix}rss force` to view the content." ) await ctx.send(msg) return async with self.config.channel(channel).feeds() as feed_data: feed_data[feed_name]["embed_image"] = image_tag_name if image_tag_name: await ctx.send(f"{embed_state_message}Embed image set to the ${image_tag_name} tag.") else: await ctx.send( "Embed image has been cleared. Use this command with a tag name if you intended to set an image tag." ) @_rss_embed.command(name="thumbnail") async def _rss_embed_thumbnail( self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None, thumbnail_tag_name: str = None ): """ Set a tag to be a thumbnail image. This thumbnail will be applied to the first embed in the paginated list. Use this command with no thumbnail_tag_name to clear the embed thumbnail. """ channel = channel or ctx.channel rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("That feed name doesn't exist in this channel.") return embed_toggle = rss_feed["embed"] embed_state_message = "" if not embed_toggle: embed_state_message += ( f"{bold(feed_name)} is not currently set to be in an embed. 
" f"Toggle it on with `{ctx.prefix}rss embed toggle`.\n" ) if thumbnail_tag_name is not None: if thumbnail_tag_name.startswith("$"): thumbnail_tag_name = thumbnail_tag_name.strip("$") else: msg = "You must use a feed tag for this setting. " msg += f"Feed tags start with `$` and can be found by using `{ctx.prefix}rss listtags` " msg += "with the saved feed name.\nImages that are scraped from feed content are usually " msg += "stored under the tags styled similar to `$content_image01`: subsequent scraped images " msg += "will be in tags named `$content_image02`, `$content_image03`, etc. Not every feed entry " msg += "will have the same amount of scraped image tags. Images can also be found under tags named " msg += "`$media_content_plaintext`, if present.\nExperiment with tags by setting them as your " msg += ( f"template with `{ctx.prefix}rss template` and using `{ctx.prefix}rss force` to view the content." ) await ctx.send(msg) return async with self.config.channel(channel).feeds() as feed_data: feed_data[feed_name]["embed_thumbnail"] = thumbnail_tag_name if thumbnail_tag_name: await ctx.send(f"{embed_state_message}Embed thumbnail set to the ${thumbnail_tag_name} tag.") else: await ctx.send( "Embed thumbnail has been cleared. " "Use this command with a tag name if you intended to set a thumbnail tag." ) @_rss_embed.command(name="toggle") async def _rss_embed_toggle(self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None): """ Toggle whether a feed is sent in an embed or not. If the bot doesn't have permissions to post embeds, the feed will always be plain text, even if the embed toggle is set. 
""" channel = channel or ctx.channel rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("That feed name doesn't exist in this channel.") return embed_toggle = rss_feed["embed"] toggle_text = "disabled" if embed_toggle else "enabled" async with self.config.channel(channel).feeds() as feed_data: feed_data[feed_name]["embed"] = not embed_toggle await ctx.send(f"Embeds for {bold(feed_name)} are {toggle_text}.") @rss.command(name="find") async def _rss_find(self, ctx, website_url: str): """ Attempts to find RSS feeds from a URL/website. The site must have identified their feed in the html of the page based on RSS feed type standards. """ async with ctx.typing(): timeout = aiohttp.ClientTimeout(total=20) async with aiohttp.ClientSession(headers=self._headers, timeout=timeout) as session: try: async with session.get(website_url) as response: soup = BeautifulSoup(await response.text(errors="replace"), "html.parser") except (aiohttp.client_exceptions.ClientConnectorError, aiohttp.client_exceptions.ClientPayloadError): await ctx.send("I can't reach that website.") return except aiohttp.client_exceptions.InvalidURL: await ctx.send( "That seems to be an invalid URL. Use a full website URL like `https://www.site.com/`." ) return except aiohttp.client_exceptions.ServerDisconnectedError: await ctx.send("The server disconnected early without a response.") return except asyncio.exceptions.TimeoutError: await ctx.send("The site didn't respond in time or there was no response.") return except Exception as e: msg = "There was an issue trying to find a feed in that site. " msg += "Please check your console for more information." 
log.exception(e, exc_info=e) await ctx.send(msg) return if "403 Forbidden" in soup.get_text(): await ctx.send("I received a '403 Forbidden' message while trying to reach that site.") return if not soup: await ctx.send("I didn't find anything at all on that link.") return msg = "" url_parse = urlparse(website_url) base_url = url_parse.netloc url_scheme = url_parse.scheme feed_url_types = ["application/rss+xml", "application/atom+xml", "text/xml", "application/rdf+xml"] for feed_type in feed_url_types: possible_feeds = soup.find_all("link", rel="alternate", type=feed_type, href=True) for feed in possible_feeds: feed_url = feed.get("href", None) ls_feed_url = feed_url.lstrip("/") if not feed_url: continue if feed_url.startswith("//"): final_url = f"{url_scheme}:{feed_url}" elif (not ls_feed_url.startswith(url_scheme)) and (not ls_feed_url.startswith(base_url)): final_url = f"{url_scheme}://{base_url}/{ls_feed_url}" elif ls_feed_url.startswith(base_url): final_url = f"{url_scheme}://{base_url}" else: final_url = feed_url msg += f"[Feed Title]: {feed.get('title', None)}\n" msg += f"[Feed URL]: {final_url}\n\n" if msg: await ctx.send(box(msg, lang="ini")) else: await ctx.send("No RSS feeds found in the link provided.") @rss.command(name="force") async def _rss_force(self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None): """Forces a feed alert.""" channel = channel or ctx.channel channel_permission_check = await self._check_channel_permissions(ctx, channel) if not channel_permission_check: return feeds = await self.config.all_channels() try: feeds[channel.id] except KeyError: await ctx.send("There are no feeds in this channel.") return if feed_name not in feeds[channel.id]["feeds"]: await ctx.send("That feed name doesn't exist in this channel.") return rss_feed = feeds[channel.id]["feeds"][feed_name] await self.get_current_feed(channel, feed_name, rss_feed, force=True) @rss.command(name="limit") async def _rss_limit( self, ctx, feed_name: str, channel: 
@rss.command(name="list")
async def _rss_list(self, ctx, channel: GuildMessageable = None):
    """List saved feeds for this channel or a specific channel."""
    # Fall back to the invoking channel when none was named.
    target = channel if channel else ctx.channel
    if not await self._check_channel_permissions(ctx, target):
        return

    feed_names = await self._get_feed_names(target)
    header = f"[ Available Feeds for #{target.name} ]\n\n\t"
    # One feed name per tab-indented line, or a placeholder when nothing is saved.
    listing = "\n\t".join(sorted(feed_names)) if feed_names else "\n\tNone."

    for chunk in pagify(header + listing, delims=["\n"], page_length=1800):
        await ctx.send(box(chunk, lang="ini"))
@checks.is_owner()
@rss.group(name="parse")
async def _rss_parse(self, ctx):
    """
    Change feed parsing for a specific domain.

    This is a global change per website.
    The default is to use the feed's updated_parsed tag, and adding a website to this list will change the check to published_parsed.

    Some feeds may spam feed entries as they are updating the updated_parsed slot on their feed, but not updating feed content.
    In this case we can force specific sites to use the published_parsed slot instead by adding the website to this override list.
    """
    # Command group only: the `add`, `list` and `remove` subcommands registered on it do the work.
    pass
@_rss_parse.command(name="remove", aliases=["delete", "del"])
async def _rss_parse_remove(self, ctx, website_url: str = None):
    """
    Remove a website from the list for a time parsing override.

    Use a website link formatted like `www.website.com` or `https://www.website.com`.
    For more information, use `[p]help rss parse`.
    """
    # The argument is optional in the signature, so show the command help when it is missing
    # rather than reporting that "`None`" is not in the list.
    if not website_url:
        await ctx.send_help()
        return

    website = self._find_website(website_url)
    if not website:
        # Same feedback as the `add` subcommand when no website can be extracted from the input.
        msg = f"I can't seem to find a website in `{website_url}`. "
        msg += "Use something like `https://www.website.com/` or `www.website.com`."
        await ctx.send(msg)
        return

    override_list = await self.config.use_published()
    if website not in override_list:
        await ctx.send(f"`{website}` isn't in the parsing override list.")
        return

    override_list.remove(website)
    await self.config.use_published.set(override_list)
    await ctx.send(f"`{website}` was removed from the parsing override list.")
""" channel = channel or ctx.channel channel_permission_check = await self._check_channel_permissions(ctx, channel, addl_send_messages_check=False) if not channel_permission_check: return success = await self._delete_feed(ctx, feed_name, channel) if success: await ctx.send("Feed deleted.") else: await ctx.send("Feed not found!") @rss.command(name="showtemplate") async def _rss_show_template(self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None): """Show the template in use for a specific feed.""" channel = channel or ctx.channel channel_permission_check = await self._check_channel_permissions(ctx, channel) if not channel_permission_check: return rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("No feed with that name in this channel.") return space = "\N{SPACE}" embed_toggle = f"[ ] Embed:{space*16}Off" if not rss_feed["embed"] else f"[X] Embed:{space*16}On" embed_image = ( f"[ ] Embed image tag:{space*6}None" if not rss_feed["embed_image"] else f"[X] Embed image tag:{space*6}${rss_feed['embed_image']}" ) embed_thumbnail = ( f"[ ] Embed thumbnail tag:{space*2}None" if not rss_feed["embed_thumbnail"] else f"[X] Embed thumbnail tag:{space*2}${rss_feed['embed_thumbnail']}" ) hex_color = rss_feed.get("embed_color", None) if hex_color: color_name = await Color()._hex_to_css3_name(hex_color) hex_color = hex_color.lstrip("0x") embed_color = ( f"[ ] Embed hex color:{space*6}None" if not hex_color else f"[X] Embed hex color:{space*6}{hex_color} ({color_name})" ) allowed_tags = rss_feed.get("allowed_tags", []) if not allowed_tags: tag_msg = "[ ] No restrictions\n\tAll tags are allowed." else: tag_msg = "[X] Feed is restricted to posts that include:" for tag in allowed_tags: tag_msg += f"\n\t{await self._title_case(tag)}" character_limit = rss_feed.get("limit", 0) if character_limit == 0: length_msg = "[ ] Feed length is unlimited." 
else: length_msg = f"[X] Feed length is capped at {character_limit} characters." embed_settings = f"{embed_toggle}\n{embed_color}\n{embed_image}\n{embed_thumbnail}" rss_template = rss_feed["template"].replace("\n", "\\n").replace("\t", "\\t") msg = f"Template for {bold(feed_name)}:\n\n`{rss_template}`\n\n{box(embed_settings, lang='ini')}\n{box(tag_msg, lang='ini')}\n{box(length_msg, lang='ini')}" for page in pagify(msg, delims=["\n"], page_length=1800): await ctx.send(page) @rss.group(name="tag") async def _rss_tag(self, ctx): """RSS post tag qualification.""" pass @_rss_tag.command(name="allow") async def _rss_tag_allow(self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None, *, tag: str = None): """ Set an allowed tag for a feed to be posted. The tag must match exactly (without regard to title casing). No regex or placeholder qualification. Tags can be found in `[p]rss listtags` under `$tags` or `$tags_list` (if tags are present in the feed - not all feeds have tags). """ channel = channel or ctx.channel rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("That feed name doesn't exist in this channel.") return async with self.config.channel(channel).feeds() as feed_data: allowed_tags = feed_data[feed_name].get("allowed_tags", []) if tag.lower() in [x.lower() for x in allowed_tags]: return await ctx.send( f"{bold(await self._title_case(tag))} is already in the allowed list for {bold(feed_name)}." ) allowed_tags.append(tag.lower()) feed_data[feed_name]["allowed_tags"] = allowed_tags await ctx.send( f"{bold(await self._title_case(tag))} was added to the list of allowed tags for {bold(feed_name)}. " "If a feed post's `$tags` does not include this value, the feed will not post." ) @_rss_tag.command(name="allowlist") async def _rss_tag_allowlist(self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None): """ List allowed tags for feed post qualification. 
""" channel = channel or ctx.channel rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("That feed name doesn't exist in this channel.") return msg = f"[ Allowed Tags for {feed_name} ]\n\n\t" allowed_tags = rss_feed.get("allowed_tags", []) if not allowed_tags: msg += "All tags are allowed." else: for tag in allowed_tags: msg += f"{await self._title_case(tag)}\n" await ctx.send(box(msg, lang="ini")) @_rss_tag.command(name="remove", aliases=["delete"]) async def _rss_tag_remove( self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None, *, tag: str = None ): """ Remove a tag from the allow list. The tag must match exactly (without regard to title casing). No regex or placeholder qualification. """ channel = channel or ctx.channel rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("That feed name doesn't exist in this channel.") return async with self.config.channel(channel).feeds() as feed_data: allowed_tags = feed_data[feed_name].get("allowed_tags", []) try: allowed_tags.remove(tag.lower()) feed_data[feed_name]["allowed_tags"] = allowed_tags await ctx.send( f"{bold(await self._title_case(tag))} was removed from the list of allowed tags for {bold(feed_name)}." ) except ValueError: await ctx.send( f"{bold(await self._title_case(tag))} was not found in the allow list for {bold(feed_name)}." ) @rss.command(name="template") async def _rss_template( self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None, *, template: str = None ): """ Set a template for the feed alert. Each variable must start with $, valid variables can be found with `[p]rss listtags`. 
""" channel = channel or ctx.channel channel_permission_check = await self._check_channel_permissions(ctx, channel) if not channel_permission_check: return if not template: await ctx.send_help() return template = template.replace("\\t", "\t") template = template.replace("\\n", "\n") success = await self._edit_template(ctx, feed_name, channel, template) if success: await ctx.send("Template added successfully.") else: await ctx.send("Feed not found!") @rss.command(name="viewtags") async def _rss_view_tags(self, ctx, feed_name: str, channel: Optional[GuildMessageable] = None): """View a preview of template tag content available from a specific feed.""" channel = channel or ctx.channel channel_permission_check = await self._check_channel_permissions(ctx, channel) if not channel_permission_check: return rss_feed = await self.config.channel(channel).feeds.get_raw(feed_name, default=None) if not rss_feed: await ctx.send("No feed with that name in this channel.") return async with ctx.typing(): await self._rss_view_tags_helper(ctx, rss_feed, feed_name) async def _rss_view_tags_helper(self, ctx, rss_feed: dict, feed_name: str): """Helper function for rss viewtags.""" blue_ansi_prefix = "\u001b[1;40;34m" reset_ansi_prefix = "\u001b[0m" msg = f"{blue_ansi_prefix}[ Template Tag Content Preview for {feed_name} ]{reset_ansi_prefix}\n\n\t" feedparser_obj = await self._fetch_feedparser_object(rss_feed["url"]) if not feedparser_obj: await ctx.send("Couldn't fetch that feed.") return if feedparser_obj.entries: # this feed has posts feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj.entries[0], rss_feed["url"]) else: # this feed does not have posts, but it has a header with channel information feedparser_plus_obj = await self._add_to_feedparser_object(feedparser_obj.feed, rss_feed["url"]) longest_key = max(feedparser_plus_obj, key=len) longest_key_len = len(longest_key) for tag_name, tag_content in sorted(feedparser_plus_obj.items()): if tag_name in 
@rss.command(name="version", hidden=True)
async def _rss_version(self, ctx):
    """Show the RSS version."""
    # __version__ is the module-level version string of this cog.
    version_line = "RSS version {}".format(__version__)
    await ctx.send(version_line)
have posts, but it has a header with channel information sorted_feed_by_post_time = [feedparser_obj.feed] if not force: entry_time = await self._time_tag_validation(sorted_feed_by_post_time[0]) if (last_time and entry_time) is not None: if last_time > entry_time: log.debug("Not posting because new entry is older than last saved entry.") return try: title = sorted_feed_by_post_time[0].title except AttributeError: title = "" try: link = sorted_feed_by_post_time[0].link except AttributeError: link = "" await self._update_last_scraped(channel, name, title, link, entry_time) feedparser_plus_objects = [] for entry in sorted_feed_by_post_time: # sometimes there's no title or no link attribute and feedparser doesn't really play nice with that try: entry_title = entry.title except AttributeError: entry_title = "" try: entry_link = entry.link except AttributeError: entry_link = "" # find the updated_parsed (checked first) or an published_parsed tag if they are present entry_time = await self._time_tag_validation(entry) # we only need one feed entry if this is from rss force if force: feedparser_plus_obj = await self._add_to_feedparser_object(entry, url) feedparser_plus_objects.append(feedparser_plus_obj) break # TODO: spammy debug logs to vvv # there's a post time to compare elif (entry_time and last_time) is not None: # this is a post with an updated time with the same link and title, maybe an edited post. 
# if a feed is spamming updated times with no content update, consider adding the full website # (www.website.com) to the rss parse command if (last_title == entry_title) and (last_link == entry_link) and (last_time < entry_time): log.debug(f"New update found for an existing post in {name} on cid {channel.id}") feedparser_plus_obj = await self._add_to_feedparser_object(entry, url) feedparser_plus_objects.append(feedparser_plus_obj) else: # a post from the future, or we are caught up if last_time >= entry_time: log.debug(f"Up to date on {name} on cid {channel.id}") break # a new post if last_link != entry_link: log.debug(f"New entry found via time and link validation for feed {name} on cid {channel.id}") feedparser_plus_obj = await self._add_to_feedparser_object(entry, url) feedparser_plus_objects.append(feedparser_plus_obj) else: # I don't belive this ever should be hit but this is a catch to debug # a feed in case one ever appears that does this log.debug( f"*** This post qualified via timestamp check but has the same link as last: {entry_title[:25]} | {entry_link}" ) # this is a post that has no time comparison information because one or both timestamps are None. # compare the title and link to see if it's the same post as previous. 
# this may need more definition in the future if there is a feed that provides new titles but not new links etc elif entry_time is None or last_time is None: if last_title == entry_title and last_link == entry_link: log.debug(f"Up to date on {name} on {channel.id} via link match, no time to compare") break else: log.debug(f"New entry found for feed {name} on cid {channel.id} via new link or title") feedparser_plus_obj = await self._add_to_feedparser_object(entry, url) feedparser_plus_objects.append(feedparser_plus_obj) # we found a match for a previous feed post else: log.debug( f"Breaking rss entry loop for {name} on {channel.id}, we found where we are supposed to be caught up to" ) break # TODO: fix rss losing its place on on store.steampowered.com feeds/post lists if len(feedparser_plus_objects) == len(sorted_feed_by_post_time): msg = (f"Couldn't match anything for feed {name} on cid {channel.id}, or switching between feed header and feed entry, only posting 1 post") log.debug(msg) feedparser_plus_objects = [feedparser_plus_objects[0]] if not feedparser_plus_objects: # early-exit so that we don't dispatch when there's no updates return # post oldest first feedparser_plus_objects.reverse() # list of feedparser_plus_objects wrapped in MappingProxyType # filled during the loop below proxied_dicts = [] sent_message = False for feedparser_plus_obj in feedparser_plus_objects: try: curr_title = feedparser_plus_obj.title except AttributeError: curr_title = "" except IndexError: log.debug(f"No entries found for feed {name} on cid {channel.id}") return # allowed tag verification section allowed_tags = rss_feed.get("allowed_tags", []) if len(allowed_tags) > 0: allowed_post_tags = [x.lower() for x in allowed_tags] feed_tag_list = [x.lower() for x in feedparser_plus_obj.get("tags_list", [])] intersection = list(set(feed_tag_list).intersection(allowed_post_tags)) if len(intersection) == 0: log.debug( f"{name} feed post in {channel.name} ({channel.id}) was denied because of an 
allowed tag mismatch." ) continue # starting to fill out the template for feeds that passed tag verification (if present) to_fill = QuietTemplate(template) message = to_fill.quiet_safe_substitute(name=bold(name), **feedparser_plus_obj) if len(message.strip(" ")) == 0: message = None if not message: log.debug(f"{name} feed in {channel.name} ({channel.id}) has no valid tags, not posting anything.") return embed_toggle = rss_feed["embed"] red_embed_settings = await self.bot.embed_requested(channel) rss_limit = rss_feed.get("limit", 0) if rss_limit > 0: # rss_limit needs + 8 characters for pagify counting codeblock characters message = list(pagify(message, delims=["\n", " "], priority=True, page_length=(rss_limit + 8)))[0] if embed_toggle and red_embed_settings: await self._get_current_feed_embed(channel, rss_feed, feedparser_plus_obj, message) else: for page in pagify(message, delims=["\n"]): await channel.send(page) sent_message = True # This event can be used in 3rd-party using listeners. # This may (and most likely will) get changes in the future # so I suggest accepting **kwargs in the listeners using this event. # # channel: Union[discord.TextChannel, discord.VoiceChannel, discord.StageChannel, discord.Thread] # The channel feed alert went to. # feed_data: Mapping[str, Any] # Read-only mapping with feed's data. # The available data depends on what this cog needs # and there most likely will be changes here in future. # Available keys include: `name`, `template`, `url`, `embed`, etc. # feedparser_dict: Mapping[str, Any] # Read-only mapping with parsed data from the feed. # See documentation of feedparser.FeedParserDict for more information. # force: bool # True if the update was forced (through `[p]rss force`), False otherwise. 
feedparser_dict_proxy = MappingProxyType(feedparser_plus_obj) proxied_dicts.append(feedparser_dict_proxy) self.bot.dispatch( "aikaternacogs_rss_message", channel=channel, feed_data=MappingProxyType(rss_feed), feedparser_dict=feedparser_dict_proxy, force=force, ) if not sent_message: return # This event can be used in 3rd-party using listeners. # This may (and most likely will) get changes in the future # so I suggest accepting **kwargs in the listeners using this event. # # channel: Union[discord.TextChannel, discord.VoiceChannel, discord.StageChannel, discord.Thread] # The channel feed alerts went to. # feed_data: Mapping[str, Any] # Read-only mapping with feed's data. # The available data depends on what this cog needs # and there most likely will be changes here in future. # Available keys include: `name`, `template`, `url`, `embed`, etc. # feedparser_dicts: List[Mapping[str, Any]] # List of read-only mappings with parsed data # from each **new** entry in the feed. # See documentation of feedparser.FeedParserDict for more information. # force: bool # True if the update was forced (through `[p]rss force`), False otherwise. 
async def _get_current_feed_embed(
    self,
    channel: GuildMessageable,
    rss_feed: dict,
    feedparser_plus_obj: feedparser.util.FeedParserDict,
    message: str,
):
    """
    Send an already-rendered feed message to a channel as one or more embeds.

    The message is paginated on newlines into one embed per page. The entry timestamp and
    the configured image go on the last embed, the configured thumbnail on the first.
    Nothing is sent when the message produces no pages.
    """
    embed_list = []
    for page in pagify(message, delims=["\n"]):
        embed = discord.Embed(description=page)
        # `.get` matches how `_rss_show_template` reads this key, so a feed record without
        # `embed_color` no longer aborts the post with a KeyError.
        color_hex = rss_feed.get("embed_color")
        if color_hex:
            embed.color = discord.Color(int(color_hex, 16))
        embed_list.append(embed)

    if not embed_list:
        return

    # Add the entry timestamp to the last embed; the updated time is preferred over the published time.
    for time_tag in ("updated_parsed_datetime", "published_parsed_datetime"):
        try:
            embed_list[-1].timestamp = feedparser_plus_obj[time_tag]
        except KeyError:
            continue
        break

    # Add embed image to last embed if it's set. KeyError covers both a missing setting and a
    # tag that this entry does not carry; the try scope is kept as wide as before.
    try:
        embed_image_tag = rss_feed["embed_image"]
        embed_image_url = feedparser_plus_obj[embed_image_tag]
        img_type = await self._validate_image(embed_image_url)
        if img_type in VALID_IMAGES:
            embed_list[-1].set_image(url=embed_image_url)
    except KeyError:
        pass

    # Add embed thumbnail to first embed if it's set (same lookup rules as the image).
    try:
        embed_thumbnail_tag = rss_feed["embed_thumbnail"]
        embed_thumbnail_url = feedparser_plus_obj[embed_thumbnail_tag]
        img_type = await self._validate_image(embed_thumbnail_url)
        if img_type in VALID_IMAGES:
            embed_list[0].set_thumbnail(url=embed_thumbnail_url)
    except KeyError:
        pass

    for embed in embed_list:
        await channel.send(embed=embed)
nothing to check log.debug(f"Sleeping, nothing to do") await asyncio.sleep(30) continue if self._post_queue_size < 300: # less than 300 entries to check means 1/sec check times # the wait is (5 min - entry count) before posting again wait = 300 - self._post_queue_size else: # more than 300 entries means we used the whole 5 min # to check and post feeds so don't wait any longer to start again wait = 0 log.debug(f"Waiting {wait}s before starting...") await asyncio.sleep(wait) await self._put_feeds_in_queue() if self._post_queue.qsize() > self._post_queue_size: # there's been more feeds added so let's update the total size # so feeds have the proper wait time @ > 300 feeds log.debug(f"Updating total queue size to {self._post_queue.qsize()}") self._post_queue_size = self._post_queue.qsize() continue else: try: # queue_item is a List of channel_priority: int, total_priority: int, queue_item: SimpleNamespace await self.get_current_feed( queue_item[2].channel, queue_item[2].feed_name, queue_item[2].feed_data ) except aiohttp.client_exceptions.InvalidURL as e: log.debug(f"Feed at {e.url} is bad or took too long to respond.") continue if self._post_queue_size < 300: wait = 1 else: wait = (300 - 10) / self._post_queue_size log.debug(f"sleeping for {wait}...") await asyncio.sleep(wait) except asyncio.CancelledError: break except Exception as e: log.error("An error has occurred in the RSS cog. 
class NoFeedContent(Exception):
    """
    Exception that carries a message payload in ``self.message``.

    NOTE(review): the name suggests it signals a feed with no usable content; no raise site
    is visible in this part of the file - confirm against the callers.
    """

    def __init__(self, m):
        # Call the base initializer as well, as is conventional for Exception subclasses.
        super().__init__(m)
        self.message = m

    def __str__(self):
        # Coerce to str: returning a non-string payload (for example a wrapped exception)
        # from __str__ makes str() raise TypeError.
        return str(self.message)