# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Postmill instances"""

from .common import BaseExtractor, Message
from .. import text, exception


class PostmillExtractor(BaseExtractor):
    """Base class for Postmill extractors"""
    basecategory = "postmill"
    directory_fmt = ("{category}", "{instance}", "{forum}")
    filename_fmt = "{id}_{title[:220]}.{extension}"
    archive_fmt = "{filename}"

    def _init(self):
        """Set up per-instance state and the compiled search helpers"""
        # host part of the root URL, i.e. everything after "://"
        self.instance = self.root.partition("://")[2]
        self.save_link_post_body = self.config("save-link-post-body", False)
        self._search_canonical_url = text.re(r"/f/([\w\d_]+)/(\d+)/").search
        # NOTE(review): this pattern contained markup and was lost from the
        # reviewed copy; restored from the Postmill submission template --
        # confirm against a live page.
        self._search_image_tag = text.re(
            r'<a href="[^"]+"\n +class="submission__image-link"').search

    def items(self):
        """Yield directory, URL and queue messages for each submission

        Submission page URLs are taken from self.post_urls(), which
        subclasses are expected to provide.
        """
        for post_url in self.post_urls():
            page = self.request(post_url).text
            extr = text.extract_from(page)

            # NOTE(review): every markup anchor below was lost from the
            # reviewed copy and has been restored from the Postmill
            # submission page structure -- confirm against a live page.
            # The calls are kept in this order on the assumption that
            # 'extr' consumes the page sequentially.
            title = text.unescape(extr(
                '<meta property="og:title" content="', '">'))
            date = self.parse_datetime_iso(extr(
                '<meta property="og:article:published_time" content="', '">'))
            username = extr(
                '<meta property="og:article:author" content="', '">')
            post_canonical_url = text.unescape(extr(
                '<link rel="canonical" href="', '">'))

            url = text.unescape(extr(
                '<h1 class="submission__title unheaderize inline"><a href="',
                '"'))
            body = extr(
                '<div class="submission__body break-text text-flow">',
                '</div>')

            match = self._search_canonical_url(post_canonical_url)
            forum = match[1]
            post_id = int(match[2])

            # a title link that starts with "/" is treated as a text post
            is_text_post = (url[0] == "/")
            is_image_post = self._search_image_tag(page) is not None
            data = {
                "title": title,
                "date": date,
                "username": username,
                "forum": forum,
                "id": post_id,
                "flair": [text.unescape(i) for i in text.extract_iter(
                    page, '<span class="flair__label">', '</span>')],
                "instance": self.instance,
            }

            urls = []
            if is_text_post or self.save_link_post_body:
                urls.append((Message.Url, "text:" + body))

            if is_image_post:
                urls.append((Message.Url, url))
            elif not is_text_post:
                # non-image link posts are queued instead of downloaded
                urls.append((Message.Queue, url))

            data["count"] = len(urls)
            yield Message.Directory, "", data
            for data["num"], (msg, url) in enumerate(urls, 1):
                if url.startswith("text:"):
                    data["filename"], data["extension"] = "", "htm"
                else:
                    data = text.nameext_from_url(url, data)

                yield msg, url, data


class PostmillSubmissionsExtractor(PostmillExtractor):
    """Base class for Postmill submissions extractors"""
    # query parameter names (besides "t" and "next[...]") that subclasses
    # allow to be forwarded with listing requests
    whitelisted_parameters = ()

    # The reviewed copy showed this as 'init'; nothing calls a method of
    # that name, so the attributes set here would never exist. Restored
    # to the constructor.
    def __init__(self, match):
        """Split the last three match groups into path, sorting and query"""
        PostmillExtractor.__init__(self, match)
        groups = match.groups()
        self.base = groups[-3]
        self.sorting_path = groups[-2] or ""
        self.query = {
            key: value
            for key, value in text.parse_query(groups[-1]).items()
            if self.acceptable_query(key)
        }

    def items(self):
        """Queue every submission of a listing, following 'next' links

        Raises exception.AbortExtraction when a request gets redirected
        to the instance's login page.
        """
        url = f"{self.root}{self.base}{self.sorting_path}"

        while url:
            response = self.request(url, params=self.query)
            if response.history:
                redirect_url = response.url
                if redirect_url == self.root + "/login":
                    raise exception.AbortExtraction(
                        f"HTTP redirect to login page ({redirect_url})")
            page = response.text

            # NOTE(review): the markup anchors, the queue message and the
            # next-page lookup below were lost from the reviewed copy and
            # have been restored -- confirm against a live listing page.
            for nav in text.extract_iter(
                    page, '<nav class="submission__nav">', '</nav>'):
                post_url = text.unescape(text.extr(nav, '<a href="', '"'))
                yield Message.Queue, text.urljoin(url, post_url), \
                    {"_extractor": PostmillPostExtractor}

            # the loop condition 'while url' ends pagination once no
            # next link is found
            url = text.unescape(text.extr(
                page, '<link rel="next" href="', '">'))

    def acceptable_query(self, key):
        """Return whether query parameter 'key' may be sent with requests"""
        return (
            key in self.whitelisted_parameters
            or key == "t"
            or (key.startswith("next[") and key.endswith("]"))
        )


BASE_PATTERN = PostmillExtractor.update({
    "raddle": {
        "root"   : None,
        "pattern": (r"(?:raddle\.me|"
                    r"c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid"
                    r"\.onion)"),
    }
})
# optional query string, captured without the leading "?"
QUERY_RE = r"(?:\?([^#]+))?$"
# optional sorting path segment followed by the optional query string
SORTING_RE = (r"(/(?:hot|new|active|top|controversial|most_commented))?" +
              QUERY_RE)


class PostmillPostExtractor(PostmillExtractor):
    """Extractor for a single submission URL"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
    example = "https://raddle.me/f/FORUM/123/TITLE"

    # Shown as 'init' in the reviewed copy; restored to the constructor
    # so that 'forum' and 'post_id' are set before post_urls() reads them.
    def __init__(self, match):
        """Store the forum name and post ID from match groups 3 and 4"""
        PostmillExtractor.__init__(self, match)
        self.forum = match[3]
        self.post_id = match[4]

    def post_urls(self):
        """Return a 1-tuple holding this submission's page URL"""
        return (f"{self.root}/f/{self.forum}/{self.post_id}",)


class PostmillShortURLExtractor(PostmillExtractor):
    """Extractor for short submission URLs"""
    subcategory = "shorturl"
    pattern = BASE_PATTERN + r"(/\d+)$"
    example = "https://raddle.me/123"

    def items(self):
        """Resolve the short URL and queue the result as a post"""
        url = self.root + self.groups[2]
        location = self.request_location(url)
        full_url = text.urljoin(url, location)
        yield Message.Queue, full_url, {"_extractor": PostmillPostExtractor}


class PostmillHomeExtractor(PostmillSubmissionsExtractor):
    """Extractor for the home page"""
    subcategory = "home"
    pattern = rf"{BASE_PATTERN}(/(?:featured|subscribed|all)?){SORTING_RE}"
    example = "https://raddle.me/"


class PostmillForumExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions on a forum"""
    subcategory = "forum"
    pattern = rf"{BASE_PATTERN}(/f/\w+){SORTING_RE}"
    example = "https://raddle.me/f/FORUM"


class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions made by a user"""
    subcategory = "usersubmissions"
    # the empty group stands in for the sorting path, so the last three
    # groups keep the (base, sorting, query) layout the constructor reads
    pattern = rf"{BASE_PATTERN}(/user/\w+/submissions)(){QUERY_RE}"
    example = "https://raddle.me/user/USER/submissions"


class PostmillTagExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions on a forum with a specific tag"""
    subcategory = "tag"
    pattern = rf"{BASE_PATTERN}(/tag/\w+){SORTING_RE}"
    example = "https://raddle.me/tag/TAG"


class PostmillSearchExtractor(PostmillSubmissionsExtractor):
    """Extractor for search results"""
    subcategory = "search"
    # the empty group stands in for the sorting path (see constructor)
    pattern = BASE_PATTERN + r"(/search)()\?(q=[^#]+)$"
    example = "https://raddle.me/search?q=QUERY"
    whitelisted_parameters = ("q",)