
    -ig                         d Z ddlZddlZddlZej                            d           ddlZddlmZ	 ddl
ZddlmZ 	 ddlZddlZn#  ddlmZ ddlmZ Y nxY w G d dej                  ZdS )z
    Arabic stopword stemmer
    N.)stopwords_lexiconc                       e Zd Zd Zed             Zd Zed             Zed             Zd Z	ed             Z
ed             Zd	 ZddZddZd Zg fdZddZd ZdS )stopword_affixerc                 Z   t           j                             |            t          j        | _        g | _        t          j        | _        t          j        | _	        t          j
        | _        t          j        | _        t          j        | _        g | _        t          j        | _        t          j        | _        t+          t          j                                                  | _        g | _        t+          t          j                                                  | _        t+          t          j                                                  | _	        t          j        | _        g | _        t          j        | _        t          j        | _        t5                      | _        t          j        | _        d S N)basic_affixer__init__SSCCOMP_PREFIX_LIST
procleticsprefixesCONJ_SUFFIX_LISTsuffixesCOMP_SUFFIX_LIST	encliticsSTOPWORDS_CONJUGATION_AFFIXaffixesCOMP_STOPWORDS_AFFIXEScliticsCOMP_PREFIX_LIST_TAGSprocletics_tagsprefixes_tagsCONJ_SUFFIX_LIST_TAGSsuffixes_tagsCOMP_SUFFIX_LIST_TAGSenclitics_tagslist%VOCALIZED_INDEX_COMP_PREFIX_LIST_TAGSkeys%VOCALIZED_INDEX_CONJ_SUFFIX_LIST_TAGS%VOCALIZED_INDEX_COMP_SUFFIX_LIST_TAGSr   
dictionary	AJUSTMENTajustment_table)selfs    W/var/www/html/speakWrite/venv/lib/python3.11/site-packages/alyahmor/stopword_affixer.pyr
   zstopword_affixer.__init__&   s*   #,,T444.,-61  #8 6!7 sHMMOOPPSFKKMMNNcGLLNNOO"H F!G
 ,--  #}    c                 t   t          | g          }|r4|t          j        t          j        z   t          j        t          j        z   fv r$| t          j        z   }|                    |           |                     t          j                  r,| dd         t          j        z   }|                    |           |}|S )u  
        Generate the Stop stem variants according to the affixes.
        For example مدرستي = >مدرست+ي = > مدرسة +ي.
        Return a list of possible cases.
        @param stem: the input stem.
        @type stem: unicode.
        @param suffix_nm: suffix (no mark).
        @type suffix_nm: unicode.
        @return: list of stem variants.
        @rtype: list of unicode.
        N)setarYEHNOONWAWaddendswithALEF_MAKSURA)stem	suffix_nmpossible_stop_listpossible_stopvalidated_lists        r'   get_stem_variantsz"stopword_affixer.get_stem_variantsZ   s    " !"
    	2I"&27*:*,&27*:*< < < 26MM""=111==   	2 "I7M""=111+r(   c                 @   t          j        |          }|}t          j        |          }|sC|dd         t           j        t           j        t           j        fv rt          j        |          rd}d| j        |         d         v rt          j        |          }n|}||fS )u$  
        Get the suffix variant to be joined to the word.
        For example: word = مدرس, suffix = ة, encletic = ي.
        The suffix is converted to Teh.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        r*   N 
   متحركtags)r,   strip_tashkeelstrip_lastharakar2   r-   ALEF	is_harakar   )r&   wordsuffixencliticenclitic_nm	newsuffixsuffix_non_irab_marks          r'   get_suffix_variantsz$stopword_affixer.get_suffix_variantsz   s     '11	"4(( 	tBCCy-    \&11  I
 D.v6v>>>#%#6y#A#A  #, ...r(   c                    t          j        |           }t          j        |           }|                    t           j                  rE|                    t           j        t           j        z             r|dd         t           j        z   }nP|                    t           j        t           j	        z             r9|                    t           j	                  rt           j        t           j
        z   }n|                    t           j        t           j	        z             re|                    t           j                  rF|                    t           j        t           j        z             st           j        t           j
        z   }nZ|                    t           j        t           j	        z             r.|                    t           j	        t           j        z             rd}|}||fS )u  
        Get the enclitic variant to be joined to the word.
        For example: word = عن, suffix = , encletic = ني.
        The word and enclitic are geminated.
        @param word: word found in dictionary.
        @type word: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
           Nr:   )r,   r>   r=   
startswithr.   r1   SUKUNSHADDAKASRAr-   FATHAr2   r/   )rA   rC   word_semi_vocword_nmenclitic_non_irab_marks        r'   get_enclitic_variantz%stopword_affixer.get_enclitic_variant   sr   " +D11#D))rw'' 	M,B,B27RXCU,V,V 	|bi/HH  BF!233 	8N8Nrv8V8V 	y28+HH  BF!233 	8N8Nr8_8_ 	##BF2?$:;; 09rx/  BF!233 	8N8NrvXZXaOa8b8b 	H "*///r(   c                    t          j        |           }t          j        |           }t          j        |          }t          j        |          }|                    t           j        t           j        z   t           j        z             r)|                    t           j                  r
|dd         }|                    t           j        t           j        z   t           j        z   t           j        z             r*|                    t           j                  r|dd         }nU|                    t           j        t           j        z             r)|                    t           j                  r
|dd         }|dk    r6|                    t           j                  r|dd         t           j	        z   }|                    t           j
                  rk|ri|                    t           j        t           j
        z             r|dd         t           j        z   }n|dd         t           j        z   t           j        z   }n|                    t           j                  r|r|                    t           j                  r|dd         t           j        z   }nj|                    t           j                  r|dd         t           j        z   }n3t           j        |v st           j        |v r|dd         t           j        z   }|r(|d         t           j        v rt          j        |          }|                    t           j                  rA|                     t           j        t           j        z             rt          j        |          }nl|                    t           j        t           j        z             r@|                     t           j        t           j        z             rt          j        |          }|S )uz  
        Get the word variant to be joined to the suffix.
        For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: suffix ( firts or second level).
        @type suffix: unicode.
        @return: variant of word.
        @rtype: unicode.
           N   rI   r:   r*   r   )r,   r>   r=   rJ   r?   LAMr1   rK   TEH_MARBUTATEHr2   r/   r-   HAMZADAMMA	WAW_HAMZArM   	YEH_HAMZAKAFBEHHARAKATr.   )rA   	procliticrB   	word_stemword_stem_nmr4   proc_nms          r'   get_word_variantz!stopword_affixer.get_word_variant   s>    '--	(..%f--	#I.. "& 026 9:: 	&w?O?OPRPV?W?W 	&!!""I"& 028 ;bf DEE 	&'JZJZ[][aJbJb 	&!!""II!!"'BF"233 	&8H8H8P8P 	&!!""I ??y11".AA?!#2#/Ibo.. 	:9 	:$$RVBO%;<< ?%crcNRW4		%crcNRV3bh>		)) 	:i 	:  ** :%crcNR\9		""28,, :%crcNR\9		7""bf&7&7%crcNR\9	  	7fQi2:--+I66I RW%% 	7$--"(8J*K*K 	7+I66IIrx"&011 	7dmm!7# 7# 	7+I66Ir(   c                    g }|}| j                             |i                               d|          D ]#}| j                            |i                               d|          D ]}|                     ||          \  }}	|                     ||||z             }|                     |||          \  }}
d                    |||
|g          }d                    ||||g          }d                    ||||g          }t          j        |          }| 	                    |          }| 	                    |          }|
                    |||f           %|S )a  
        Join the  stop and its affixes, and get the vocalized form
        @param stop: stop found in dictionary.
        @type stop: unicode.
        @param proclitic: first level prefix.
        @type proclitic: unicode.

        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: vocalized word.
        @rtype: unicode.
        	vocalizedr:   -)r   getr   rR   rd   rG   joinr,   r=   ajust_vocalizationappend)r&   stopr`   rB   rC   word_tuple_list
suffix_vocproclitic_vocenclitic_vocencl_voc_non_inflectrF   word_non_irab_markword_vocalized	segmenteds                 r'   vocalizezstopword_affixer.vocalize   s    
 "155iDDHHV_`` "	X "	XM $ 3 7 7" E E I I+W_ ` ` !X !X595N5NtO[6] 6]22 ,,T=&8BSTT 483K3K*l44 440
0 &(WW"D*>M&O &O" "$-z<)X!Y!Y  HHmT:|%TUU	-i88	%)%<%<=O%P%P"!%!8!8!H!H&&8JI'VWWWWC!XD r(   c                 $      fd|D             S )ad  
        Verify possible affixes in the resulted segments according
        to the given affixes list.
        @param word: the input word.
        @type word: unicode.
        @param list_seg: list of word segments indexes (numbers).
        @type list_seg: list of pairs.
        @return: list of acceped segments.
        @rtype: list of pairs.
        c                 ~    g | ]9}d                      d|d                  |d         d         g          v 7|:S )rg   Nr   rI   )ri   ).0s
affix_listrA   s     r'   
<listcomp>z1stopword_affixer.verify_affix.<locals>.<listcomp>G  sW     
 
 
xxeqted1Q455k233zAA AAAr(    )rA   list_segrz   s   ` `r'   verify_affixzstopword_affixer.verify_affix;  s4    
 
 
 
 

 
 
 	
r(   c                    t          |t                    r|                    d          }d|v r
| d         sdS d|v r
| d         sdS d|v r
| d         rdS d|v r
| d	         sdS d
|v r
| d	         sdS d|v r| d	         rd|vrdS d|v r
| d         sdS d|v r
| d	         sdS d|v r
| d         sdS d|v r
| d         rdS |t          j        k    r
| d	         sdS d|v r/| d	         s%| d                             t          j                  rdS d|v rd| d         vrdS d|v r
| d         sdS d|v r
| d         sdS d|v r
| d         sdS dS )a  
        Test if the given word from dictionary is compabilbe with affixes tags.
        @param stop_tuple: the input word attributes given from dictionary.
        @type stop_tuple: dict.
        @param affix_tags: a list of tags given by affixes.
        @type affix_tags:list.
        @param procletic: first level prefix vocalized.
        @type procletic: unicode.
        @param encletic_nm: first level suffix vocalized.
        @type encletic_nm: unicode.
        @return: if the tags are compaatible.
        @rtype: Boolean.
        :u   عطفhas_conjuctionF
   تعريفhas_definition
is_definedu
   مجرورis_inflected
   مرفوع   جرhas_prepositionr;      مضافhas_pronounu
   وقايةrA   u   فعل	type_wordu   استفهامhas_interrogu   قسم	has_qasam
   تنوينtanwinT)
isinstancestrsplitr,   r-   r1   )
stop_tuple
affix_tags	procleticencletic_nms       r'   validate_tagszstopword_affixer.validate_tagsL  s   $ j#&& 	/#))#..J
"":6F+G"5J&&z:J/K&5J&&:l+C&5J&&z./I&5J&&z./I&5j  Z%? YcHcHc5j  4E)F 5J&&z./I&5*$$Z-F$5*$$L)A$5"&  N)C 5 J&&J~,F&*4V*<*E*Ebf*M*M '5J&&8:k;R+R+R5
**:n3M*5 
"":k+B"5
 J&&z(/C&5 tr(   c                 <    | j                             ||          }|S )z
        ajust vocalization
        Temporary function
        @param vocalized: vocalized word.
        @type vocalized: unicode.
        @return: ajusted vocalized word.
        @rtype: unicode.
        )r%   rh   )r&   rf   ajusteds      r'   rj   z#stopword_affixer.ajust_vocalization  s!     &**9i@@r(   r:   c                    g }|                      ||          rq|                     |||          rZ|s|                     ||||          }|                     ||||          }|r&d |D             }|D ]}|                    |           |S )z generate stopword form c                 ,    g | ]}t          |          S r|   )r   )rx   xs     r'   r{   z-stopword_affixer.get_form.<locals>.<listcomp>  s    #B#B#BDGG#B#B#Br(   )is_valid_cliticscheck_clitic_affixget_tagsru   rk   )	r&   rA   procprefsuffencr<   newword_list
word_tuples	            r'   get_formzstopword_affixer.get_form  s      s++ 		0&&tS$77 0 @==tT3??D#}}T4sCC 0#B#B\#B#B#BL&2 0 0
"))$////r(   Nc           
      P   g }t          j                  }|s| j                            |d          }|k    r!t	          |          dk    rfd|D             }|D ]}|                    d          }t          j        | j        | j	        | j
                  D ]|}|d         }|d         }	|d         }
|                     |||	|
          }	 |                     ||||
          r1|                     ||d|	|
|          }|r|                    |           }d |D             }|S )z generate all possible affixes
        We can genearte stopwords based on stop_tuple_list
        preprared to generate all forms according to a csv file.
        If stop_tuple_list = None: the current library use arabicstopwords library.
        T)lemmarI   c                 f    g | ]-}t          j        |                    d                     +|.S )rf   )r,   vocalizedlikerh   )rx   sprA   s     r'   r{   z3stopword_affixer.generate_forms.<locals>.<listcomp>  sO     U U Ub"$"2266+t3L3Ld"S"SUr U U Ur(   rf   r   rT   Frg   	r:   c                     g | ]}||S r|   r|   )rx   r   s     r'   r{   z3stopword_affixer.generate_forms.<locals>.<listcomp>  s    <<<<"<<<r(   )r,   r=   r#   get_stopwordtupleslenrh   	itertoolsproductr   r   r   r   ri   printr   r   r   extend)r&   rA   stop_tuple_liststopword_formsrP   r   vocalized_wordelementr   r   r   r<   affixaffix_nmr   s    `             r'   generate_formszstopword_affixer.generate_forms  s     #D)) 	V"o@@PT@UUO
 7??s?33a77U U U UO U U UO) $	< $	<J'^^K>>N$,T_dmT^\\ "< "<qzqzaj  }}^T4EE] %%j$cBB < $(==r4QTVZ#[#[L# <&--l;;;E"<L =<~<<<r(   c                    g }| j                             |i                               dd          }|                    |           | j                            |i                               dd          }|                    |           | j                            |i                               dd          }|                    |           d |D             }d                    |          S )z+
        Get affixes tags
        
        r<   r|   c                     g | ]}||S r|   r|   )rx   ts     r'   r{   z-stopword_affixer.get_tags.<locals>.<listcomp>  s    ++++1+++r(   r   )r   rh   r   r   r   ri   )	r&   rA   r   rB   rC   taglistproclitic_tagsenclitic_tagssuffix_tagss	            r'   r   zstopword_affixer.get_tags  s    
 -11)R@@DDVRPP~&&&+//"==AA&"MM}%%%(,,VR88<<VRHH{###
 ,+g+++xx   r(   c                     g }|d         }|d         }|d         }|| j         vs|| j        vs	|| j        vrdgS |                     ||d||          }|S )z2 generate all possible word forms by given affixesr   rT   rU   )Zerroukitahar:   )r   r   r   r   )r&   rA   r   r   r   r   r   s          r'   generate_by_affixesz$stopword_affixer.generate_by_affixes  su     qzqzaj
 ''4t}+D+DSWSaHaHa())tT2tSAAr(   Tc                     d|                                }|sd |D             }nd |D             }t          t          |                    }fd|D             }|S )z generate all affixes u   قصدc                 B    g | ]}t          j        |d                    S r   )r,   r=   rx   ds     r'   r{   z8stopword_affixer.generate_affix_list.<locals>.<listcomp>!  s'    LLLB-ad33LLLr(   c                     g | ]
}|d          S r   r|   r   s     r'   r{   z8stopword_affixer.generate_affix_list.<locals>.<listcomp>#  s    999QAaD999r(   c                 <    g | ]}|                     d           S )rg   )replace)rx   r   rA   s     r'   r{   z8stopword_affixer.generate_affix_list.<locals>.<listcomp>'  s'    CCC		$,,CCCr(   )r   r   r+   )r&   rf   r   list_affixesrA   s       @r'   generate_affix_listz$stopword_affixer.generate_affix_list  s    ,,T22 	:LL^LLLLL99.999LC--..CCCClCCCr(   c                 |   | j                             |i                               dd          }| j                            |i                               dd          }| j                            |i                               dd          }|| j                            |i                               dd          z  }d|v rd|v rdS d|v rd|v rdS d|v rd|v rdS d|v rd	|v rdS d
|v rd|v rdS d
|v rd|v rdS |                    t
          j                  r!|                    t
          j                  rdS dS )a  
        Verify if proaffixes (sytaxic affixes) are compatable
        with affixes ( conjugation)
        @param proclitic_nm: first level prefix.
        @type proclitic_nm: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @return: compatible.
        @rtype: True/False.
        r<   r|   casesr   r   Fu
   إضافةr   u   لايضافr   r   u
   منصوبT)	r   rh   r   r   rJ   r,   r-   r1   rZ   )r&   proclitic_nmrC   rB   r   r   r   s          r'   r   z#stopword_affixer.check_clitic_affix+  s     -11,CCGGPRSS+//"==AA&"MM(,,VR88<<VRHH 	t)--fb99==grJJJ N**}/K/K5n,,+1M1M5 M))m{.J.J5M))o.L.L5&&=K+G+G5&&=K+G+G5  (( 	V__RX-F-F 	5 4r(   )r:   r:   r:   r:   r   )T)__name__
__module____qualname__r
   staticmethodr8   rG   rR   rd   ru   r~   r   rj   r   r   r   r   r   r   r|   r(   r'   r   r   %   s]       2- 2- 2-h   \> /  /  /D "0 "0 \"0H 8 8 \8t= = =~ 
 
 \
  D D \DL      @ @ @ @D! ! !* 13    &   "B B B B Br(   r   )__doc__resyspprintpathrk   r   pyarabic.arabyarabyr,   $arabicstopwords.stopwords_classifiedarabicstopwords!arabicstopwords.stopwords_lexiconr   r	   aly_stem_stopword_constr   alyahmor.basic_affixer alyahmor.aly_stem_stopword_constr   r|   r(   r'   <module>r      s    
			 



                + + + + ? ? ? ? ? ?3)))))322222222222222H	 H	 H	 H	 H	}2 H	 H	 H	 H	 H	s   A A