o
    Wi9                     @   sn  d Z ddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZ ddlZddlmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' e(  e)e*Z+e,dZ-de.e/B de.fddZ0d4dede1de.dB ddfddZ2	d5dedede3ddfddZ4		d6dedede.dB de3ddf
ddZ5				d7ded e.d!ed"e	e6 d#e.d$e3de.dB d%e.dB ddfd&d'Z7e#d(Z8e#d)Z9d e.d#e.d%e.ddfd*d+Z:					d8ded e.d!e
e6 d"e	e6 d#e.d$e3de.dB d%e.dB de3ddfd,d-Z;defd.d/Z<d4d0e=e. dB ddfd1d2Z>e*d3kr5e>  dS dS )9z#Extract pdf structure in XML format    N)ArgumentParser)	ContainerIterable)AnyTextIOcast)PDFDocumentPDFNoOutlinesPDFXRefFallback)
PDFIOErrorPDFObjectNotFoundPDFTypeErrorPDFValueError)PDFPage)	PDFParser)	PDFObjRef	PDFStreamresolve1stream_value)LIT	PSKeyword	PSLiteral)isnumberz&[\000-\037&<>()"\042\047\134\177-\377]sreturnc                 C   s(   t | tr
t| dn| }tdd |S )Nzlatin-1c                 S   s   dt | d dS )Nz&#r   ;)ordgroup)m r   X/var/www/Credit_scoring_API/venv310/lib/python3.10/site-packages/../../../bin/dumppdf.py<lambda>"   s    zescape.<locals>.<lambda>)
isinstancebytesstrESC_PATsub)r   usr   r   r    escape    s   r(   outobjcodecc                 C   s0  |d u r|  d d S t|trC|  dt| d | D ]\}}|  d| d |  d t| | |  d q|  d d S t|tri|  d	t| d |D ]}t| | |  d
 qU|  d d S t|ttfr|  dt| dt	| d d S t|t
r|dkr|  |  d S |dkr|  |  d S |  d t| |j |  d |dkr| }|  dt| dt	| d |  d d S t|tr|  d|j d d S t|tr|  d|j d d S t|tr|  d|j d d S t|r|  d| d d S t|)Nz<null />z<dict size="">
z<key>z</key>
z<value>z	</value>
z</dict>z<list size="
z</list>z<string size="z">z	</string>rawbinaryz<stream>
<props>
z

</props>
textz<data size="z</data>
z	</stream>z	<ref id="z" />z	<keyword>z
</keyword>z	<literal>z
</literal>z<number>z	</number>)writer"   dictlenitemsdumpxmllistr$   r#   r(   r   Zget_rawdataget_dataattrsr   objidr   namer   r   r   )r)   r*   r+   kvdatar   r   r    r5   %   sd   







 
	

 



r5   Fdocshow_fallback_xrefc                 C   sr   |j D ]}t|tr|r| d t| |  | d qtdd |j D }|r5|s7d}t| d S d S d S )Nz
<trailer>
z
</trailer>

c                 s   s    | ]}t |tV  qd S N)r"   r
   ).0xrefr   r   r    	<genexpr>p   s    zdumptrailers.<locals>.<genexpr>zThis PDF does not have an xref. Use --show-fallback-xref if you want to display the content of a fallback xref that contains all objects.)	xrefsr"   r
   r1   r5   Zget_trailerallloggerwarning)r)   r>   r?   rB   Zno_xrefsmsgr   r   r    dumptrailersf   s   


rI   c           	      C   s   t  }| d |jD ]N}| D ]G}||v rq|| z"||}|d u r)W q| d| d t| ||d | d W q tyX } ztd| W Y d }~qd }~ww qt	| || | d d S )Nz<pdf>z<object id="r,   r+   z
</object>

znot found: z</pdf>)
setr1   rD   
get_objidsaddgetobjr5   r   printrI   )	r)   r>   r+   r?   visitedrB   r9   r*   er   r   r    dumpallobjsz   s*   



rR    outfpfnameobjidspagenospassworddumpall
extractdirc              	      s  t |d}t|}	t|	| dd tt dD }
dtdtf fdd}z  }| 	d	 |D ]t\}}}}}d }|rJ||}|
|d
 j
 }n(|rr|}t|trr|d}|rrt|dkrr|drr||d }|
|d
 j
 }t|}| 	d|d| d |d ur| 	d t| | | 	d |d ur| 	d|d | 	d q3| 	d W n	 ty   Y nw |	  W d    d S 1 sw   Y  d S )Nrbc                 S   s   i | ]\}}|j |qS r   )Zpageid)rA   pagenopager   r   r    
<dictcomp>   s    zdumpoutline.<locals>.<dictcomp>   destr   c                    s`   t | ttfrt | } nt | trt | j} t | tr%| d } t | tr.| 	 } | S )ND)
r"   r$   r#   r   Zget_destr   r:   r2   r   resolve)r`   r>   r   r    resolve_dest   s   


z!dumpoutline.<locals>.resolve_destz<outlines>
r   Sz/'GoTo'ra   z<outline level="z	" title="r,   z<dest>z</dest>
z<pageno>z
</pageno>
z</outline>
z</outlines>
)openr   r   	enumerater   create_pagesobjectr   Zget_outlinesr1   r9   r"   r2   getreprr(   r5   r	   close)rT   rU   rV   rW   rX   rY   r+   rZ   fpparserZpagesrd   Zoutlinesleveltitler`   aZ_ser\   actionsubtyper   r   rc   r    dumpoutline   sJ   








"rt   ZFilespecZEmbeddedFilec           
         s   dt dtttf dd f fdd}t| dA}t|}t|| t } jD ](}|	 D ]!} 
|}	||vrNt|	trN|	dtu rN|| |||	 q-q'W d    d S 1 s[w   Y  d S )Nr9   r*   r   c                    s   t j|dptt|d }|d dp |d d} |j}t	|t
s6d|d}t||dturEtd|dt j| dd	| }t j|r_td
|td| t jt j|dd t|d}||  W d    d S 1 sw   Y  d S )NZUFFZEFz%unable to process PDF: reference for z is not a PDFStreamTypez is not an EmbeddedFileZ06d-zfile exists: zextracting: T)exist_okwb)ospathbasenamerj   r   r#   decoderN   r9   r"   r   r   LITERAL_EMBEDDEDFILEjoinexistsr   rO   makedirsdirnamerf   r1   r7   )r9   r*   filenameZfilereffileobj	error_msgr{   r)   r>   rZ   r   r    extract1   s&   &


"z!extractembedded.<locals>.extract1r[   rv   )intr2   r$   r   rf   r   r   rK   rD   rL   rN   r"   rj   LITERAL_FILESPECrM   )
rU   rX   rZ   r   rm   rn   Zextracted_objidsrB   r9   r*   r   r   r    extractembedded   s$   $




"r   c	                 C   s   t |de}	t|	}
t|
|}|r"|D ]}||}t| ||d q|rNtt|D ]"\}}||v rM|rG|jD ]}t	|}t| ||d q8q+t| |j
 q+|rWt| ||| |sc|sc|sct| || W d    n1 smw   Y  |dvr}| d d S d S )Nr[   rJ   )r.   r/   r-   )rf   r   r   rN   r5   rg   r   rh   contentsr   r8   rR   rI   r1   )rT   rU   rV   rW   rX   rY   r+   rZ   r?   rm   rn   r>   r9   r*   r\   r]   r   r   r    dumppdf   s4   


r   c                  C   sd  t tdd} | jdtd ddd | jddd	d
tj d | jdddddd |  }|jdddddd |jddtdd | jddd}|jdtd ddd |jddtd d |jd!d"td#d |jd$d%ddd&d |jd'dd(d) |jd*d+td,d-d. | jd/d0d}|jd1d2td3d4d. | }|jd5d6ddd7d |jd8d9ddd:d |jd;d<ddd=d | S )>NT)descriptionadd_helpfiles+zOne or more paths to PDF files.)typedefaultnargshelpz	--versionz-vversionzpdfminer.six v)rr   r   z--debugz-dF
store_truezUse debug logging level.)r   rr   r   z--extract-tocz-TzExtract structure of outlinez--extract-embeddedz-EzExtract embedded files)r   r   ParserzUsed during PDF parsing)r   z--page-numbersz0A space-seperated list of page numbers to parse.z	--pagenosz-pzA comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.z	--objectsz-iz1Comma separated list of object numbers to extractz--allz-az3If the structure of all objects should be extractedz--show-fallback-xrefzAdditionally show the fallback xref. Use this if the PDF has zero or only invalid xref's. This setting is ignored if --extract-toc or --extract-embedded is used.)rr   r   z
--passwordz-PrS   z,The password to use for decrypting PDF file.)r   r   r   OutputzUsed during output generation.z	--outfilez-orw   zJPath to file where output is written. Or "-" (default) to write to stdout.z--raw-streamz-rz%Write stream objects without encodingz--binary-streamz-bz)Write stream objects with binary encodingz--text-streamz-tz"Write stream objects as plain text)	r   __doc__add_argumentr$   pdfminer__version__add_mutually_exclusive_groupadd_argument_groupr   )rn   Zprocedure_parserZparse_paramsZoutput_paramsZcodec_parserr   r   r    create_parser  s   
r   argvc           	      C   sX  t  }|j| d}|jrt tj |jr"dd |jdD ng }|j	r0dd |j	D }n|j
r?dd |j
dD }nt }|j}|jrKd}n|jrQd	}n|jrWd
}nd }|jdkratjnt|jd;}|jD ].}|jr~t||||||j|d d qk|jrt|||jd qkt||||||j|d |jd	 qkW d    d S 1 sw   Y  d S )N)argsc                 S   s   g | ]}t |qS r   r   rA   xr   r   r    
<listcomp>      zmain.<locals>.<listcomp>,c                 S   s   h | ]}|d  qS r_   r   r   r   r   r    	<setcomp>  r   zmain.<locals>.<setcomp>c                 S   s   h | ]}t |d  qS r   r   r   r   r   r    r     s    r.   r/   r0   rw   w)rX   rY   r+   rZ   )rX   rZ   )rX   rY   r+   rZ   r?   )r   
parse_argsdebuglogging	getLoggersetLevelDEBUGobjectssplitZpage_numbersrW   rK   rX   Z
raw_streamZbinary_streamZtext_streamoutfilesysstdoutrf   r   Zextract_tocrt   rE   Zextract_embeddedr   r   r?   )	r   rn   r   rV   rW   rX   r+   rT   rU   r   r   r    main  s`    

"r   __main__r@   )F)NF)rS   FNN)rS   FNNF)?r   r   Zos.pathrz   rer   argparser   collections.abcr   r   typingr   r   r   r   Zpdfminer.pdfdocumentr   r	   r
   Zpdfminer.pdfexceptionsr   r   r   r   Zpdfminer.pdfpager   Zpdfminer.pdfparserr   Zpdfminer.pdftypesr   r   r   r   Zpdfminer.psparserr   r   r   Zpdfminer.utilsr   basicConfigr   __name__rF   compiler%   r$   r#   r(   ri   r5   boolrI   rR   r   rt   r   r~   r   r   r   r6   r   r   r   r   r    <module>   s   

 D

	
;,	

#{
;
