o
    Nhh                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dl Z d dlZd dlm	Z	 d dl
Zd dlmZ e  ejdeejddejdejdejd	d
Zdd Zdd Zdd Zdd Zdd Zdd Ze Zdd Zdd Zedkr~e  dS dS )    N)Document)load_dotenv
MYSQL_HOST
MYSQL_PORTi  
MYSQL_USERMYSQL_PASSWORDMYSQL_DATABASE)hostportuserpassworddatabasec            
      C   s   t jjdi t} |  }|d | }W d    n1 s!w   Y  i }|D ]0\}}}}|r5|gng }|dd |dD 7 }|sM|t	
| dd| d }	|||	< q*|S )	Nz
            SELECT id, label, COALESCE(regex_pattern, ''), COALESCE(synonyms, '')
            FROM business_case_factors
            WHERE is_active = 1
        c                 S   s"   g | ]}|  rt|  qS  )stripreescape).0sr   r   -/var/www/html/ai_worker/parser_extract_kpi.py
<listcomp>!   s   " z#load_factor_map.<locals>.<listcomp>,(|)r   )mysql	connectorconnectcfgcursorexecutefetchallsplitappendr   r   join)
cnxcurrows
factor_mapfidlabelrexsynpatternspatternr   r   r   load_factor_map   s   

	
r.   c                 C   s.   t  D ]\}}t|| tjr|  S qd S N)
FACTOR_MAPitemsr   searchI)textpatr(   r   r   r   guess_factor,   s
   r6   c                 C   s"   t d| }|rt|dS d S )Nu   (-?\d+(?:\.\d+)?)(?:\s?[%€])?   )r   r2   floatgroup)r4   mr   r   r   grab_numbers2   s   r;   c           	      C   sf   t | }g }|jD ]&}|jD ] }|jsq|jj}t|}t|}|r/|d ur/|	|||f qq
|S r/   )
pptxPresentationslidesshapeshas_text_frame
text_framer4   r6   r;   r"   )	pathsector_hintprsr&   slideshapetxtr(   valr   r   r   
parse_pptx8   s   


rI   c                 C   sZ   t | }g }|D ]!}|  D ]}t|}t|}|r)|d ur)||||f qq	|S r/   )fitzopenget_text
splitlinesr6   r;   r"   )rB   rC   docr&   pageliner(   rH   r   r   r   	parse_pdfE   s   
rQ   c                 C   s   t | }g }|jD ] }|j }|sq	t|}t|}|r)|dur)||||f q	|jD ])}|jD ]#}	d	dd |	j
D }
t|
}t|
}|rU|durU||||f q2q-|S )z
    Estrae coppie (sector, factor_id, best_value) da un file Word.
    Funziona sia su paragrafi plain-text sia su tabelle.
    N c                 s   s    | ]}|j  V  qd S r/   )r4   r   )r   cellr   r   r   	<genexpr>f   s    zparse_docx.<locals>.<genexpr>)r   
paragraphsr4   r   r6   r;   r"   tablesr&   r#   cells)rB   rC   rN   r&   parrG   r(   rH   tablerowtxt_rowr   r   r   
parse_docxP   s*   



r\   c                 C   s   t | }||g d  jdd }tjjdi t+}| }|j	ddD ]}t 
|jr1d n|j}|d|j|j|f q'W d    n1 sJw   Y  td d S )	Nsector	factor_id
best_valuer7   )axisF)indexz
              INSERT INTO case_study_kpi (sector, factor_id, best_value)
              VALUES (%s,%s,%s)
              ON DUPLICATE KEY UPDATE best_value = VALUES(best_value)
            u   📥 Import terminator   )pdread_csvnotnaallr   r   r   r   r   
itertuplesisnar`   r   r^   r_   print)csvfiledfcr%   rZ   rH   r   r   r   load_into_dbp   s   

rm   c                  C   s   t jdd \} }}g }| drtt| |}n| dr&tt| |}n| dr2tt| |}t|ddd}t	|}|
g d	 || W d    n1 sUw   Y  t| d S )
Nr7      z.pptxz.pdfz.docxw )newliner]   )sysargvendswithlistrI   rQ   r\   rK   csvwriterwriterow	writerowsrm   )in_fileout_csvr^   r&   fro   r   r   r   main   s   



r}   __main__)r   rv   rr   r<   rJ   mysql.connectorr   osdocxr   pandasrc   dotenvr   environgetintr   r.   r6   r;   rI   rQ   r\   r0   rm   r}   __name__r   r   r   r   <module>   s0   ( 




