
    i                      F    S r SSKJrJr  SSKJrJr  SSKr " S S\5      r	g)	zGeneric information extractor   )	ExtractorMessage   )configtext    Nc                       \ rS rSrSrSrSrSrSr\	R                  " SS5      (       a  \S	-  r\S
-  rSrS rS rS rS rSrg)GenericExtractor   z+Extractor for images in a generic web page.generic)z
{category}z{subcategory}z{path}z
{imageurl}z(?i)(?P<generic>g(?:eneric)?:))	extractorr   enabled?zj(?P<scheme>https?://)?(?P<domain>[-\w\.]+)(?P<path>/[^?#]*)?(?:\?(?P<query>[^#]*))?(?:\#(?P<fragment>.*))?z$generic:https://www.nongnu.org/lzip/c                    US   U l         [        R                  " X5        US   (       a  US   R                  S5      S   U l        O%U R
                  R                  S5        US   U l        US   (       a  US   U l        O7SU l        [        R                  " U R                  U R                  5      U l        US	   U l
        U R                  US   -   U l        g )
Ndomainr   r   :r   z.Falling back on generic information extractor.schemezhttps://path)subcategoryr   __init__	partitionurlloginfor   r   ensure_http_schemer   root)selfmatchs     I/app/mltbenv/lib/python3.13/site-packages/gallery_dl/extractor/generic.pyr   GenericExtractor.__init__&   s     ?4' Qx))#.q1DHHHMMJKQxDH ?/DK$DK..txxEDH&M	 KK%/1	    c              #     #    U R                  U R                  5      R                  nU R                  U5      nU R	                  U5      n [        U5      US'   [        US5      n[        R                  SU4v   U Hj  u  US'   u  pVU(       a.  UR                  U5        SU;  a  [        R                  " XR5        O[        R                  " XR5        [        R                  XR4v   Ml     g! [         a     Nf = f7f)ztGet page, extract metadata & images, yield them in suitable messages

Adapted from common.GalleryExtractor.items()

countr    num	extensionN)requestr   r   metadataimageslen	TypeError	enumerater   	Directoryupdatenameext_from_urlUrl)r   pagedataimgsr)   r   imgdatas          r   itemsGenericExtractor.items>   s      ||DHH%**}}T"{{4 	IDM 4#T))+1'DK#G$g-))#4%%c0++s(( ,2  		s+   AC;C+ BC;+
C85C;7C88C;c                    [         R                  " USS5      [         R                  " USS5      [         R                  " USS5      [         R                  " USS5      [         R                  " USS5      [         R                  " USS5      [         R                  " US	S5      [         R                  " US
S5      [         R                  " USS5      [         R                  " USS5      S.
nUR                  5        VVs0 s H%  u  p4U(       d  M  U[         R                  " U5      _M'     nnnU R                  R                  SS5      US'   U R                  US'   U$ s  snnf )z8Extract generic webpage metadata, return them in a dict.z<title>z</title>z"<meta name="description" content=""z<meta name="keywords" content="z<meta name="language" content="z<meta itemprop="name" content="z <meta name="copyright" content="z"<meta property="og:site" content="z'<meta property="og:site_name" content="z#<meta property="og:title" content="z)<meta property="og:description" content=")
titledescriptionkeywordslanguagename	copyrightog_siteog_site_nameog_titleog_description/r$   r   pageurl)r   extrr5   unescaper   replacer   )r   r1   r2   kvs        r   r(   GenericExtractor.metadataY   s:    #iii-"ii:CA"ii7>"ii7>"ii7>"ii8#?"ii:CA"ii?F"ii;SB"iiA3H'
0 15

B#4==##Byy((b1V((Y	 Cs   =EEc                    Sn Sn[         R                  " U5      R                  U5      n[         R                  " U5      R                  U5      nXE-   n[         R                  " S5      R                  U5      nU(       a  US   R	                  S5      U l        OoU R                  R                  S5      (       a!  U R                  R	                  S5      U l        O.[        R                  R                  U R                  5      U l        / nU H  n	U	R                  S5      (       a  UR                  U	5        M,  U	R                  S5      (       a/  UR                  U R                  U	R                  S5      -   5        Mq  U	R                  S5      (       a   UR                  U R                  U	-   5        M  UR                  U R
                  S-   U	-   5        M     [         R#                  U5      nU V	s/ s H  oSU	04PM
     n
n	U
$ s  sn	f )	a  Extract image urls, return a list of (image url, metadata) tuples.

The extractor aims at finding as many _likely_ image urls as possible,
using two strategies (see below); since these often overlap, any
duplicate urls will be removed at the end of the process.

Note: since we are using re.findall() (see below), it's essential that
the following patterns contain 0 or at most 1 capturing group, so that
re.findall() return a list of urls (instead of a list of tuples of
matching groups). All other groups used in the pattern should be
non-capturing (?:...).

1: Look in src/srcset attributes of img/video/source elements

See:
https://www.w3schools.com/tags/att_src.asp
https://www.w3schools.com/tags/att_source_srcset.asp

We allow both absolute and relative urls here.

Note that srcset attributes often contain multiple space separated
image urls; this pattern matches only the first url; remaining urls
will be matched by the "imageurl_pattern_ext" pattern below.
zE(?i)<(?:img|video|source)\s[^>]*src(?:set)?=[\"']?(?P<URL>[^\"'\s>]+)zY(?i)(?:[^?&#\"'>\s]+)\.(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus)(?:[^\"'<>\s]*)?z/(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)r   rC   httpz//imageurl)r   refindallsearchrstripbaseurlr   endswithosr   dirname
startswithappendr   lstripr   dictfromkeys)r   r1   imageurl_pattern_srcimageurl_pattern_extimageurls_srcimageurls_ext	imageurls	basematchabsimageurlsur)   s              r   r)   GenericExtractor.imagesy   s   6# 	
	  	  45==dC 45==dC!1	 GG>@@Ft 	$U+2237DL xx  %%#xxs3!wwtxx8 A||F####A&d####DKK!((3-$?@c""##DIIM2 ##DLL3$6$:;  }}\2 1==1z1o&= >s   2H)rR   r   r   r   r   r   N)__name__
__module____qualname____firstlineno____doc__categorydirectory_fmtarchive_fmtpatternr   getexampler   r5   r(   r)   __static_attributes__ r!   r   r
   r
      sd    5H=MK
 0Gzz*I664 	#G 5G20)6@ar!   r
   )
rh   commonr   r   r$   r   r   os.pathrT   r
   rp   r!   r   <module>rs      s!    $ &  Ly Lr!   