
    -iO              
          d Z ddlZddlmZ ddlZddlZddlmZ ddl	m
Z ddl	mZ i dddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.Z G d/ d0          Zd1 Zd2 Zd3 Zd4 Zd5 Zd6 ZdS )7zX
Arabic unknown word stemmer.
Unkown words are stemmed as nouns with another dictionary
    N   )stem_noun_const)wordcaseid	vocalizedunvocalized   wordtype   root   
normalized   stamped   original   mankous   	feminable	   number
   dualable   masculin_plural   feminin_plural   broken_plural   mamnou3_sarf   relative                           )w_suffix	hm_suffix
kal_prefix	ha_suffixk_suffixannex
definitionnotec                   8    e Zd ZdZd
dZd Zd Zd Zd Zd Z	d	S )UnknownStemmerzd
    Arabic unknown word stemmer.
    Unkown words are stemmed as nouns with another dictionary
    Fc                    t           j                                        | _        | j                            t
          j                   | j                            t
          j                   | j        	                    t
          j
                   | j                            t
          j                   | j                            t
          j                   | j                            t
          j                   | j                            t
          j                   | j                            t
          j                   t           j                                        | _        | j                            t
          j                   | j                            t
          j                   | j        	                    t
          j                   | j                            t
          j                   | j                            t
          j                   | j                            t
          j                   | j                            t
          j                   | j                            t
          j                   t=          j        dt<          j                   | _!        | j!        | _"        i | _#        i | _$        || _%        d S )Nwordfreq)&
tashaphynestemmingArabicLightStemmercomp_stemmerset_infix_letterssnconstCOMP_INFIX_LETTERSset_prefix_lettersCOMP_PREFIX_LETTERSset_suffix_lettersCOMP_SUFFIX_LETTERSset_max_prefix_lengthCOMP_MAX_PREFIXset_max_suffix_lengthCOMP_MAX_SUFFIXset_min_stem_lengthCOMP_MIN_STEMset_prefix_listCOMP_PREFIX_LISTset_suffix_listCOMP_SUFFIX_LISTconj_stemmerCONJ_INFIX_LETTERSCONJ_PREFIX_LETTERSCONJ_SUFFIX_LETTERSCONJ_MAX_PREFIXCONJ_MAX_SUFFIXCONJ_MIN_STEMCONJ_PREFIX_LISTCONJ_SUFFIX_LISTwordfreqdictionaryclassWordFreqDictionaryWORDFREQ_DICTIONARY_INDEXr9   noun_dictionary
noun_cachenoun_vocalize_cachedebugselfr^   s     R/var/www/html/speakWrite/venv/lib/python3.11/site-packages/qalsadi/stem_unknown.py__init__zUnknownStemmer.__init__E   s   &/BBDD++G,FGGG,,W-HIII,,W-HIII//0GHHH//0GHHH--g.CDDD))'*BCCC))'*BCCC '/BBDD++G,FGGG,,W-HIII,,W-HIII//0GHHH//0GHHH--g.CDDD))'*BCCC))'*BCCC/B/I
 
  $}#% 


    c           
         g }|                                 }|g}|                    t          j                  dk    rJ|                    |                    t          j        t          j        t          j        z                        |D ]}| j                            |          }t          ||t          j                  }|D ]}|d|d                  }||d         |d                  }||d         d         }| j        r9t          dd                    |||g                              d                     |g}	|dk    r|                    t          j                  r#|	                    |t          j        z              n|                    t          j                  r#|	                    |t          j        z              n|                    t          j                  r+|	                    |dd         t          j        z              nI|                    t          j                  r*|	                    |dd         t          j        z              g }
|	D ]}|
|                     ||||          z  }
||
z  }Ð|S )	z
        Analyze word morphologically as noun
        @param noun: the input noun.
        @type noun: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   Nr   	-utf8 )stripfindaraby
ALEF_MADDAappendreplaceALEF_HAMZA_ABOVEr=   segmentverify_affixr?   COMP_NOUN_AFFIXESr^   printjoinencodeendswithYEHNOONWAWALEFALEF_MAKSURATEHTEH_MARBUTAsteming_second_level)r`   noundetailed_result	noun_listlist_seg_compseg	procleticstemencletic	list_stemresults              ra   stemming_nounzUnknownStemmer.stemming_nouni   sy    zz||F	99U%&&!++$e&<u?U&U     	* 	*D -55d;;M(}g>WXXM$ * * 3q6N	CFSVO,A> : V$)T8)D E E L LV T TUUU "F	r>>}}UY// H!((
):;;;;uy11 H!((
):;;;;uz22 H!((crcU5G)GHHHHuy11 H!((crcU5F)FGGG % Y YDd77dIxXXXFF6)5*6 rc   c                 B   g }| j                             |          }t          ||t          j                  }g }|D ]}|d|d                  }	||d         |d                  }
||d         d         }t          j        |         d         D ].}d||
d}t          |||          r|                    |           /|}|D ]]}|d         }	|d         }
|d	         }t          j	        
                    |
          }
t          |
|	|          }g }|D ]}|                     |          }||z  }|D ]}|d         }d
}|d         }|                     |||	||          }|                    t          j        |||	||f|
||||d                    t          j        |         d         t          j        |         d         z   t          j        |         d         z             d                    d|g          |d         d                    |          dd                     _|S )a%  
        Analyze word morphologically by stemming the conjugation affixes.
        @param noun: the input noun.
        @type noun: unicode.
        @param noun2: the noun stemed from syntaxic affixes.
        @type noun2: unicode.
        @param procletic: the syntaxic prefixe extracted in the fisrt stage.
        @type procletic: unicode.
        @param encletic: the syntaxic suffixe extracted in the fisrt stage.
        @type encletic: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        Nr   r   r   rh   )prefixsuffixr   r   r   r    	word_type:tagsNounfreq)wordaffixr   r   lemmar   semivocalizedr   typer   originaltagssyntax)rO   rq   rr   r?   NOMINAL_CONJUGATION_AFFIXCONJ_SUFFIX_LIST_TAGSis_compatible_proaffix_affixrn   r:   	normalizenormalize_hamzaget_stem_variantslookup_dictvocalizer   WordCaseru   COMP_PREFIX_LIST_TAGSCOMP_SUFFIX_LIST_TAGS)r`   r   noun2r   r   r   list_seg_conjlist_seg_conj_vocseg_conjprefix_conj	stem_conjsuffix_conjvocalized_suffixseg_conj_vocpossible_noun_listinfnoun_form_listinfnouninfnoun_foundlist
noun_tupleoriginal_tagsr
   r   s                         ra   r   z#UnknownStemmer.steming_second_level   s    )11%88$='"C
 

 % 	; 	;H(1+.KhqkHQK78I.K %,$A+$N% ; ;  !.%    0	8EUVV ; &,,\:::; *% 6	 6	H"8,K (I"8,K #,<<YGGI "39k;!W!W !#- 7 7 %)$4$4W$=$=!!%66!!/  
$[1 "%k2 MMY[( 	  &&%$(&/k8%T$-(/%,)2-6$'HH ' =i H P")"?"I&"Q!R")"?"LV"T!U% %
 %(HHfh-?$@$@$.v$6,/HH],C,C&(!     B rc   c                     || _         dS )z
        Set the debug attribute to allow printing internal analysis results.
        @param debug: the debug value.
        @type debug: True/False.
        N)r^   r_   s     ra   	set_debugzUnknownStemmer.set_debug  s     


rc   c                 |    || j         v r| j         |         S | j                            |d          }|| j         |<   |S )z)
        lookup for word in dict
        unknown)r\   r[   lookup)r`   r   r   s      ra   r   zUnknownStemmer.lookup_dict  sG     4?""?4(()00yAAF$*DOD!rc   c                    d                     |||||g          }|| j        v r| j        |         S t          j        |         d         d         }t          j        |         d         d         }|}	|dd         t
          j        v r
|dd         }|                    t
          j        t
          j	        t
          j        z             }|                    t
          j
        t
          j	        t
          j
        z             }t          j        dt
          j	        z  t
          j	        |          }t          j        dt
          j	        z  d|          }|                    t
          j        t
          j        z             rst          j        |d                   rYd                     |d         t
          j        |d	d         g          }|                    t
          j                  r
|dd         }t%          ||          }t%          ||          }t'          ||	|          }	d                     ||||	|g          }
|
| j        |<   |
S )
a  
        Join the  noun and its affixes, and get the vocalized form
        @param noun: noun found in dictionary.
        @type noun: unicode.
        @param proclitic: first level prefix.
        @type proclitic: unicode.
        @param prefix: second level suffix.
        @type prefix: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: vocalized word.
        @rtype: unicode.
        rf   r   r   ri   Nz(%s)+z^(%s)+rh   r   )ru   r]   r?   r   r   rl   HARAKATro   r{   FATHAr|   resubrw   LAMis_sunSHADDASUKUNget_word_variantget_suffix_variant)r`   r   	procliticr   r   enclitickeyenclitic_vocproclitic_voc
suffix_voc	noun_conjs              ra   r   zUnknownStemmer.vocalize  s   " hhiBCC$***+C004X>{KAN5i@MaP
 9%%9D ||EJej(@AA ||E.e>P0PQQvg+U[$?? vh,b$77
 ej59455 	3%,tAw:O:O 	377DGU\48<==D%%ek22 3 -crc 2f--h//'j(CC
GG]FD*lSTT	(1 %rc   N)F)
__name__
__module____qualname____doc__rb   r   r   r   r   r   r   rc   ra   r7   r7   ?   s         
" " " "H4 4 4le e eN  
 
 
7 7 7 7 7rc   r7   c                    | dk    r|dk    rdS t           j        |          d         }t           j        |         d         }t           j        |         d         }d|v r
d|v rd|vrdS d|v rd|v rdS d|v rd|v rdS d|v rd	|v rdS d
|v rd|vrdS dS )a`  
    Verify if proaffixes (sytaxic affixes) are compatable
    with affixes (conjugation)
    @param procletic: first level prefix.
    @type procletic: unicode.
    @param encletic: first level suffix.
    @type encletic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @return: compatible.
    @rtype: True/False.
    rh   Tr   u
   تعريفu   مضافu
   منسوبF
   تنوينu   لايضافu   جرu
   مجرور)r?   r   r   r   )r   r   r   procletic_tagsencletic_tagssuffix_tagss         ra   r   r   T  s     B8r>>t29=fEN1(;FCM/7?K&&+%%++u~%%,+*E*Eu]""|{'B'Bu]""~'D'DuL$C$Cu4rc   c                 t   t          j        |          }|                    t           j                  dk    r=t	          |          dk    r*t          j        t           j        t           j        |          }|dk    r=| dd         t           j        t           j	        t           j
        fv r|t           j        v rd}|S )u  
    Get the suffix variant to be joined to the word.
    For example: word  = مدرس, suffix = ة, encletic = ي.
    The suffix is convert to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffix.
    @rtype: unicode.
    r   rh   ri   N)rl   strip_tashkeelrk   r~   lenr   r   r}   r|   rx   r{   r   )r   r   r   enclitic_nms       ra   r   r   x  s     &x00K{{5$%%**s;/?/?!/C/C)59f==rI%,eiDDDem##Mrc   c                    | }t          j        |          }|dd         t           j        v r
|dd         }|                    t           j                  rp|t           j        t           j        z   t           j        t           j        z   t           j        t           j        t           j        z   t           j        z   fv r|dd         }n|                    t           j                  r|dk    r|dd         t           j        z   }nn|                    t           j                  r|dk    r|dd         t           j        z   }n0|                    t           j	                  r|dk    r	|
                    t           j                  r|dd         t           j        z   }n|
                    t           j                  r|dd         t           j        z   }n|                    t           j        t           j	        z             s9|                    t           j        t           j        z   t           j	        z             r6|
                    t           j                  r|dd         t           j        z   }|S )uW  
    Get the word variant to be joined to the suffix.
    For example: word  = ةمدرس, suffix = ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix ( firts or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    ri   Nrh   )rl   r   r   rw   r~   r{   r}   rx   r|   HAMZA
startswithDAMMA	WAW_HAMZAKASRA	YEH_HAMZAr   FATHATAN)r   r   	word_stem	suffix_nms       ra   r   r     s    I$V,,I~&&crcN	%+,, 9
UY	E%%		EJ*	? 2 2 crcN					E-	.	. 99??crcNUY.					E.	/	/ 9IOOcrcNUY.					EK	(	( 	9Y"__U[)) 	9!#2#8IIu{++ 	9!#2#8IIuy5;677	9!!%)ek"9EK"GHH	9 //	9 "#2#8Irc   c                    t          | g          }t          j        |          }t          j        |          }|}|t          j        t          j        z   t          j        t          j        z   t          j        t          j        t          j        z   t          j        z   fv r$| t          j        z   }|                    |           |dk    s:|t          j        t          j        z   k    s|t          j	        t          j        z   k    r$| t          j        z   }|                    |           | 
                    t          j                  r,| dd         t          j        z   }|                    |           |}|S )u  
    Generate the Noun stem variants according to the affixes.
    For example مدرستي = >مدرست+ي  = > مدرسة +ي.
    Return a list of possible cases.
    @param stem: the input stem.
    @type stem: unicode.
    @param prefix: prefixe.
    @type prefix: unicode.
    @param suffix: suffixe.
    @type suffix: unicode.
    @return: list of stem variants.
    @rtype: list of unicode.
    rh   Nri   )setrl   r   r{   r}   rx   r~   addry   rz   rw   r|   )r   r   r   prefix_possible_noun_listr   possible_nounvalidated_lists          ra   r   r     s9   " !$TF!&))F!&))F2
UY	E%%		EJ*	   u00}---"UY+++UY+++uy(}---}}UY .SbS	E$66}---'Nrc   c                 $      fd|D             S )a@  
    Verify possible affixes in the resulted segments according
    to the given affixes list.
    @param word: the input word.
    @type word: unicode.
    @param list_seg: list of word segments indexes (numbers).
    @type list_seg: list of pairs.
    @return: list of acceped segments.
    @rtype: list of pairs.
    c                 ~    g | ]9}d                      d|d                  |d         d         g          v 7|:S )rf   Nr   r   )ru   ).0s
affix_listr   s     ra   
<listcomp>z verify_affix.<locals>.<listcomp>  sL    XXX!388T&AaD&\4!<,H#I#IZ#W#WA#W#W#Wrc   r   )r   list_segr   s   ` `ra   rr   rr     s$     YXXXXxXXXXrc   c                 *    d|v r| d         dk    rdS dS )aF  
    Test if the given word from dictionary is compabilbe with affixes tags.
    @param noun_tuple: the input word attributes given from dictionary.
    @type noun_tuple: dict.
    @param affix_tags: a list of tags given by affixes.
    @type affix_tags:list.
    @return: if the tags are compatible.
    @rtype: Boolean.
    r   r   	noun_propFTr   )r   
affix_tagss     ra   validate_tagsr     s(     z!!j&=&L&Lu4rc   )r   r   pyarabic.arabyrl   tashaphyne.stemmingr:   tashaphyne.normalize arramooz.wordfreqdictionaryclassrX   rh   r   r?   r   NOUN_DICTIONARY_INDEXr7   r   r   r   r   rr   r   r   rc   ra   <module>r      s    
			               C B B B B B ) ( ( ( ( (      ! 1 	
 A ! q  q  b  r b R  B!" #$ 3   :R R R R R R R Rj! ! !H  6% % %P* * *ZY Y Y"    rc   