class ArchitizerProjectExtractor(GalleryExtractor):
    """Extractor for project pages on architizer.com"""
    category = "architizer"
    subcategory = "project"
    root = "https://architizer.com"
    directory_fmt = ("{category}", "{firm}", "{title}")
    filename_fmt = "{filename}.{extension}"
    archive_fmt = "{gid}_{num}"
    pattern = r"(?:https?://)?architizer\.com/projects/([^/?#]+)"
    example = "https://architizer.com/projects/NAME/"

    def __init__(self, match):
        """Build the project URL from the name captured by 'pattern'"""
        # match[1] is the '([^/?#]+)' group, i.e. the path segment
        # following '/projects/'
        url = f"{self.root}/projects/{match[1]}/"
        GalleryExtractor.__init__(self, match, url)

    def metadata(self, page):
        """Collect project metadata from the HTML of a project page

        Args:
            page: HTML text of the project page

        Returns:
            dict with the keys 'title', 'slug', 'gid', 'firm', 'location',
            'type', 'status', 'year', 'size' and 'description'
        """
        extr = text.extract_from(page)
        # The result is discarded; presumably this call only moves the
        # search position of 'extr' past the 'id="Pages"' marker so that
        # the lookups below start from there -- TODO confirm
        extr('id="Pages"', "")

        # NOTE(review): the HTML markers used for 'location', 'type',
        # 'status', 'year', 'size' and 'description' (and the last four
        # keys themselves) were restored after the markup inside these
        # string literals had been stripped, which left the method
        # unparseable. Verify them against the live page markup.
        return {
            "title"      : extr("data-name='", "'"),
            "slug"       : extr("data-slug='", "'"),
            # keep only the part after the last '.' of the attribute value
            "gid"        : extr("data-gid='", "'").rpartition(".")[2],
            "firm"       : extr("data-firm-leaders-str='", "'"),
            "location"   : extr("<h2>", "<").strip(),
            "type"       : text.unescape(text.remove_html(extr(
                '<div class="title">Type</div>', '<br'))),
            "status"     : text.remove_html(extr(
                '<div class="title">STATUS</div>', '</')),
            "year"       : text.remove_html(extr(
                '<div class="title">YEAR</div>', '</')),
            "size"       : text.unescape(text.remove_html(extr(
                '<div class="title">SIZE</div>', '</'))),
            # HTML line breaks become real newlines before unescaping
            "description": text.unescape(extr(
                '<span class="copy js-copy">', '</span></div>')
                .replace("<br />", "\n")),
        }

    def images(self, page):
        """Return a list of (url, None) tuples for the images in 'page'

        Every 'og:image:secure_url' meta property contributes one entry.
        Using "?" as end marker cuts each URL off before its query string.
        """
        return [
            (url, None)
            for url in text.extract_iter(
                page, 'property="og:image:secure_url" content="', "?")
        ]