
    -iys                         d Z ddlZddlZddlZej                            d           ddlZddlmZ	 	 ddl
Z
ddlZn#  ddlm
Z
 ddlmZ Y nxY wd Zd Z G d de
j
                  ZdS )z
    Arabic noun stemmer
    N.c                 $      fd|D             S )a@  
    Verify possible affixes in the resulted segments according
    to the given affixes list.
    @param word: the input word.
    @type word: unicode.
    @param list_seg: list of word segments indexes (numbers).
    @type list_seg: list of pairs.
    @return: list of acceped segments.
    @rtype: list of pairs.
    c                 ~    g | ]9}d                      d|d                  |d         d         g          v 7|:S -Nr      join.0s
affix_listwords     S/var/www/html/speakWrite/venv/lib/python3.11/site-packages/alyahmor/noun_affixer.py
<listcomp>z verify_affix.<locals>.<listcomp>+   sW       88T%1Q4%[$qtuu+.//:== 	
===     r   list_segr   s   ` `r   verify_affixr       s4           r   c                 `   |}d|v r
| d         rdS | d         dv rd|v rdS d|v rdS d|v rdS |d	v r
| d
         sdS |dv r
| d         sdS d|vr
| d         sdS |                     d          r
| d         sdS |t          j        k    r
| d         sdS |t          j        k    r| d         dk    rdS dS )aF  
    Test if the given word from dictionary is compabilbe with affixes tags.
    @param noun_tuple: the input word attributes given from dictionary.
    @type noun_tuple: dict.
    @param affix_tags: a list of tags given by affixes.
    @type affix_tags:list.
    @param proclitic_nm: first level prefix vocalized.
    @type proclitic_nm: unicode.
    @param enclitic_nm: first level suffix vocalized.
    @type enclitic_nm: unicode.
    @param suffix_nm: first level suffix vocalized.
    @type suffix_nm: unicode.
    @return: if the tags are compatible.
    @rtype: Boolean.
    
   تنوينmamnou3_sarfFnumber)   جمعu   جمع تكسيرu   جمع مؤنث سالمu   جمع مذكر سالمu   مثنى)u   همu   هنu   كماu   كمu   هما	hm_suffix)u   هu   ها	ha_suffix   الk_prefix
kal_prefixw_suffixr   T)endswitharWAWTEH_MARBUTA)
noun_tuple
affix_tagsproclitic_nmenclitic_nm	suffix_nm	proclitics         r   validate_tagsr,   0   s,   $ I 
""z.'A"u(@@@&*445&*445*$$5 " " "*4[*A"u&&&z+/F&ui
:(>u'"" :l+C u BF:j#9uBN""z(';y'H'Hu
 4r   c                       e Zd Zd Zed             ZddZed             Zed             Zd Z	ed             Z
ed	             Zed
             ZddZd Zd Zg fdZddZd ZdS )noun_affixerc                 p   t           j                             |            t          j        | _        g | _        t          j        | _        t          j        | _	        t          j
        | _        t          j        | _        t          j        | _        g | _        t          j        | _        t          j        | _        d S )N)basic_affixer__init__SNCCOMP_PREFIX_LIST
procleticsprefixesCONJ_SUFFIX_LISTsuffixesCOMP_SUFFIX_LIST	encliticsNOMINAL_CONJUGATION_AFFIXaffixesCOMP_NOUN_AFFIXEScliticsCOMP_PREFIX_LIST_TAGSprocletics_tagsprefixes_tagsCONJ_SUFFIX_LIST_TAGSsuffixes_tagsCOMP_SUFFIX_LIST_TAGSenclitics_tags)selfs    r   r1   znoun_affixer.__init__l   s    #,,T333.,-4,  #8 6!7r   c                    t          | g          }|t          j        t          j        z   t          j        t          j        z   t          j        t          j        t          j        z   t          j        z   fv r$| t          j        z   }|                    |           |r4|t          j        t          j        z   t          j        t          j        z   fv r$| t          j        z   }|                    |           | 	                    t          j                  r,| dd         t          j
        z   }|                    |           | 	                    t          j                  r,| dd         t          j        z   }|                    |           |}|S )u  
        Generate the Noun stem variants according to the affixes.
        For example مدرستي = >مدرست+ي = > مدرسة +ي.
        Return a list of possible cases.
        @param stem: the input stem.
        @type stem: unicode.
        @param suffix_nm: suffix (no mark).
        @type suffix_nm: unicode.
        @return: list of stem variants.
        @rtype: list of unicode.
        N)setr#   ALEFTEHYEHr%   addNOONr$   r"   ALEF_MAKSURAHAMZA	YEH_HAMZA)stemr*   possible_noun_listpossible_nounvalidated_lists        r   get_stem_variantsznoun_affixer.get_stem_variants   sN   " !"
   26)26BN+BBF"')BF24 4 4 2>1M""=111 	2I"&27*:BFRW<L)MMM 26MM""=111==   	2 "I7M""=111=="" 	2 "I4M""=111 ,r   Fc                    |}|}|                     t          j                  dk    r-|r+t          j        t          j        t          j        |          }nI|sGt          j        |          r3|dd         t          j        t          j        fv rd}n|rt          j	        }d| j
        |         d         v rt          j        |          }n|}||fS )u  
        Get the suffix variant to be joined to the word.
        For example: word = مدرس, suffix = ة, enclitic = ي.
        The suffix is converted to Teh.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @param mankous: if the noun is mankous ends with Yeh منقوص.
        @type mankous: boolean.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        r   rG   N u
   متحركtags)findr#   r%   resubrJ   	is_harakarK   rI   KASRATANrB   strip_lastharaka)rE   r   suffixencliticmankousr)   	newsuffixsuffix_non_irab_marks           r   get_suffix_variantsz noun_affixer.get_suffix_variants   s    $ 	;;r~&&!+++r~rvv>>II 	(f!5!5 	(BCCyRVRW---		 ( K	 D.v6v>>>#%#6y#A#A  #,  ...r   c                    | }|                      t          j        t          j        z             r|                    t          j                  r{|                     t          j        t          j        z   t          j                  }|                     t          j        t          j        z   t          j        t          j        z             } |                      t          j                  r|                    t          j        t          j        z             s|                    t          j                  r%t          j	        }t          j	        t          j
        z   } |                    t          j        t          j	        z   t          j
        z             sr|                    t          j        t          j	        z   t          j        z             s9|                    t          j        t          j	        z   t          j        z             rd}d} | |fS )u  
        Get the enclitix variant to be joined to the word.
        For example: word = كتاب, suffix = كسرة, enclitic = هم.
        The enclitic has a second form هِم.
        @param enclitic_voc: first level suffix vocalized.
        @type enclitic_voc: unicode.
        @param suffix_voc: second level suffix vocalized.
        @type suffix_voc: unicode.
        @return: variant of enclitic  (vocalized enclitic and vocalized
        enclitic without I'rab short mark).
        @rtype: (unicode, unicode)
        rW   )
startswithr#   HEHDAMMAr"   KASRAreplacerK   SUKUNSHADDAFATHA)enclitic_voc
suffix_vocencl_vo_no_inflect_marks      r   get_enclitic_variantz!noun_affixer.get_enclitic_variant   s    #/""26BH#455 	C*:M:M; ; 	C&2&:&:26BH;L;=6'C 'C#'//0A020AC CL ""26** 	#
 ""26"(?33 3z7J7J267R7R 3+-9'!y"(2
 ##BF29$4bh$>?? #""26")#3RX#=>>#""26")#3RX#=>># ,/'"444r   c                 J   | }t          j        |          }t          j        |          }||z   }t          j        |          }|                    t           j                  r|                    t           j        t           j        z             r|t           j        t           j        t           j        z   t           j        t           j        z   t           j        z   fv r|dd         t           j        z   }n.|t           j        t           j        z   k    rt          j        |dd                   }n|dk    r|dd         t           j        z   }n|t           j        t           j        z   t           j        t           j        z   t           j        t           j        t           j        z   t           j        z   fv rt          j        |dd                   }nR|dk    r|dd         t           j        z   }n3|                    t           j                  r>|dk    r|dd         t           j        z   }n|dk    r|dd         t           j        z   }n|                    t           j	        t           j        z             r|d|vr&d|vr"|s |st          j        |dd                   }n|t           j        t           j
        z   t           j        t           j
        z   fv rt          j        |dd                   }n.|                    t           j                  r|s|r	|                    t           j                  r|dd         t           j        z   }n|                    t           j	                  r|dd         t           j        z   }n|                    t           j        t           j        z             s9|                    t           j        t           j        z   t           j        z             r6|                    t           j                  r|dd         t           j        z   }|S )u  
        Get the word variant to be joined to the suffix.
        For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
        @param word: word found in dictionary.
        @type word: unicode.
        @param proclitic: proclitic ( first level).
        @type proclitic: unicode.
        @param suffix: suffix ( first level).
        @type suffix: unicode.
        @param enclitic: enclitic( second level).
        @type enclitic: unicode.
        @return: variant of word.
        @rtype: unicode.
        NrG   rW   r   u   لل)r#   strip_tashkeelr^   r"   r%   rI   rK   rJ   rN   ri   rM   r$   rO   rf   rh   	WAW_HAMZArP   rk   FATHATAN)r   r+   r_   r`   	word_stemr*   r)   long_suffix_nms           r   get_word_variantznoun_affixer.get_word_variant  s     	%f--	'11"[0'	22	 bn-- 3	:!!"'BN":;; 4".)@!#"'!1BF!:!< < < )#2# 7II"'BF"222 " 3IcrcN C CII#s** )#2# 7Irw/".1H"&!v/"&8: : : /	#2#??		  3&&%crcNRV3	00 	: C%crcNRV3		##%crcNRW4	26 122 	: i''G9,D,D[,Daj,D/	#2#??		 rv/"'1ABBB/	#2#??	 )) 	:y 	:K 	:  ** :%crcNR\9		""28,, :%crcNR\9		$$RVbh%677 :''(9BH(DEE:((55: &crcNR\9	r   c                 h   g }|}t          j        |dd                   r
|dd         }|                    t           j        t           j                  }|                    t           j        t           j        t           j        z             }|                    t           j        t           j        t           j        z             }t          j        dt           j        z  t           j        |          }t          j        dt           j        z  d|          }|	                    t           j
        t           j        z             rdnd}|                     ||||          }|                     ||||          \  }}| j                            |i                               d|          D ]}	| j                            |i                               d|          D ]}
|                     |
|          \  }
}d	| j        |         d
         v rst          j        |d                   rYd                    |d         t           j        |dd         g          }|		                    t           j                  r
|	dd         }	d                    |	|||g          }d                    |	|||
g          }d                    |	|||
g          }t          j        |          }t.          j        D ]1\  }}|                    ||          }|                    ||          }2|                    |||f           |S )a  
        Join the  noun and its affixes, and get the vocalized form
        @param noun: noun found in dictionary.
        @type noun: unicode.
        @param proclitic: first level prefix.
        @type proclitic: unicode.

        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: vocalized word.
        @rtype: unicode.
        rG   Nz(%s)+z^(%s)+rW   TF	vocalized
   تعريفrX   r   r   r   )r#   r\   rj   rv   rm   rI   rN   rZ   r[   r"   ri   rK   ry   rd   r?   getrD   rq   is_sunr
   rl   rk   rt   r2   AJUST_VOCAL_PATTERNSappend)rE   nounr+   r_   r`   word_tuple_listro   ra   rc   proclitic_vocrn   encl_voc_non_inflectword_non_irab_markword_vocalized	segmentedpatrnrepls                    r   vocalizeznoun_affixer.vocalizea  s     
 <RSS	"" 	9D ||BK22 ||BGRX%788 ||BORX-GHHvh)28T::vi"(*B55 --26(9::E$$$$T9fhGG
 ,0+C+C*h,1 ,1(
( "155iCCGGU^__ 	T 	TM $ 3 7 7 D D H HV^ _ _ * *595N5N &6* 6*222
  4Y ? GGGBIVZ[\V]L^L^Gxxa")T!""X >?? ))"(33 7$1#2#$6M
 "$&:<PQ"S "S
  WWmT:|%TUUN-z<!PQQI))44I"7 M Mt!/!7!7t!D!D%7%?%?t%L%L""""N4F	#RSSSSr   c                 :   g }t           j        | v r|                    |                     t           j        t           j        dz                       |                    |                     t           j        t           j        t           j        z                        |S )z generate noun varaintes    )r#   
ALEF_MADDAr   rj   ALEF_HAMZA_ABOVErO   rI   )r   	noun_lists     r   get_noun_variantsznoun_affixer.get_noun_variants  s     	=D  R]B,?!,CDDF F FR]BHbg,=>>@ @ @r   c                 z   g }|r|                      t          j                  r+|                    | dd         t          j        z              nl|                      t          j                  r*|                    | dd         t          j        z              n"|                    | t          j        z              |S )z generate stem varaintes NrG   )r"   r#   rI   r   rN   rJ   r%   rK   )rQ   r)   	list_stems      r   get_input_stem_variantsz$noun_affixer.get_input_stem_variants  s     	 	,}}RW%% =  crcR_!<====rv&& =  crcR^!;<<< TBF]+++r   c                 $      fd|D             S )ad  
        Verify possible affixes in the resulted segments according
        to the given affixes list.
        @param word: the input word.
        @type word: unicode.
        @param list_seg: list of word segments indexes (numbers).
        @type list_seg: list of pairs.
        @return: list of acceped segments.
        @rtype: list of pairs.
        c                 ~    g | ]9}d                      d|d                  |d         d         g          v 7|:S r   r	   r   s     r   r   z-noun_affixer.verify_affix.<locals>.<listcomp>  sW     
 
 
xxeqted1Q455k233zAA AAAr   r   r   s   ` `r   r   znoun_affixer.verify_affix  s4    
 
 
 
 

 
 
 	
r   rW   c           	      8   g }t          j        |          }|                     ||          rm|                     |||          rV|                     ||||          }|r<d |D             }|D ]-}|                    |                     ||||                     .|S )z generate noun form c                 ,    g | ]}t          |          S r   )list)r   xs     r   r   z)noun_affixer.get_form.<locals>.<listcomp>  s    #B#B#BDGG#B#B#Br   )r#   rt   is_valid_cliticscheck_clitic_affixr   r   get_tags)rE   r   procprefsuffencnewword_list
word_tuples           r   get_formznoun_affixer.get_form  s     &&  s++ 	Q&&tS$77 Q#}}T4sCC Q#B#B\#B#B#BL&2 Q Q
"))$--tT3*O*OPPPPr   c                     g }t          j        | j        | j        | j                  D ]J}|d         }|d         }|d         }|                     ||d||          }|r|                    |           K|S )z generate all possible affixesr   r   r   rW   )	itertoolsproductr4   r7   r9   r   extend)rE   r   
noun_formselementr   r   r   r   s           r   generate_formsznoun_affixer.generate_forms   s     
 (t~' ' 	0 	0G1:D1:D!*C==tRcBBL 0!!,///r   c                    g }| j                             |i                               dd          }|                    |           | j                            |i                               dd          }|                    |           | j                            |i                               dd          }|                    |           d |D             }d                    |          S )z+
        Get affixes tags
        
        rX   r   c                     g | ]}||S r   r   )r   ts     r   r   z)noun_affixer.get_tags.<locals>.<listcomp>"  s    ++++1+++r   :)r?   r}   r   rD   rB   r
   )	rE   r   	procleticr_   r`   taglistproclitic_tagsenclitic_tagssuffix_tagss	            r   r   znoun_affixer.get_tags  s    
 -11)R@@DDVBOO~&&&+//"==AA&LL}%%%(,,VR88<<VBGG{###
 ,+g+++xx   r   c                     g }|d         }|d         }|d         }|| j         vs|| j        vs	|| j        vrdgS |                     ||d||          }|S )z2 generate all possible word forms by given affixesr   r      )ZerroukitaharW   )r4   r7   r9   r   )rE   r   r;   r   r   r   r   s          r   generate_by_affixesz noun_affixer.generate_by_affixes%  su     
qzqzaj
 ''4t}+D+DSWSaHaHa'((]]4r$<<
r   Tc                     d|                                }|sd |D             }nd |D             }t          t          |                    }fd|D             }|S )z generate all affixes u   قصدc                 B    g | ]}t          j        |d                    S r   )r#   rt   r   ds     r   r   z4noun_affixer.generate_affix_list.<locals>.<listcomp>?  s'    IIIR.qt44IIIr   c                     g | ]
}|d          S r   r   r   s     r   r   z4noun_affixer.generate_affix_list.<locals>.<listcomp>A  s    666aQqT666r   c                 <    g | ]}|                     d           S )r   )rj   )r   r   r   s     r   r   z4noun_affixer.generate_affix_list.<locals>.<listcomp>E  s'    CCC4,,CCCr   )r   r   rH   )rE   r{   r   list_affixesr   s       @r   generate_affix_listz noun_affixer.generate_affix_list8  s    ((..
 	7IIjIIILL66:666LC--..CCCClCCCr   c                 h   | j                             |i                               dd          }| j                            |i                               dd          }| j                            |i                               dd          }|| j                            |i                               dd          z  }d|v rd|v rdS d|v rd|v rdS d|v rd|v rdS d|v rd	|v rdS d
|v rd|vrdS |                    t
          j                  r!|                    t
          j                  rdS dS )a  
        Verify if proaffixes (sytaxic affixes) are compatable
        with affixes ( conjugation)
        @param proclitic_nm: first level prefix.
        @type proclitic_nm: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @return: compatible.
        @rtype: True/False.
        rX   r   casesr|   r   Fu
   إضافةu   مضافu   لايضافu   جرu
   مجرورT)	r?   r}   rD   rB   rf   r#   rK   r"   rh   )rE   r(   r`   r_   r   r   r   s          r   r   znoun_affixer.check_clitic_affixH  sk   N -11,CCGGrRR+//"==AA&LL(,,VR88<<VBGG 	t)--fb99==grJJJ N**}/K/K5n,,+1M1ME M))m{.J.JEM))o.L.LE&&=+K+K5  (( 	V__RX-F-F 	5 Dr   N)F)rW   rW   rW   )T)__name__
__module____qualname__r1   staticmethodrU   rd   rq   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r.   r.   k   sg       8 8 8B & & \&R-/ -/ -/ -/^ (5 (5 \(5T M M \M^` ` `B 	 	 \	   \$ 
 
 \
       ! ! !* 35    &    W W W W Wr   r.   )__doc__rZ   syspprintpathr   r   pyarabic.arabyarabyr#   r0   aly_stem_noun_constr2   alyahmor.basic_affixeralyahmor.aly_stem_noun_constr   r,   r.   r   r   r   <module>r      s    
			 



               /%%%%%/222222........   7 7 7vt t t t t=. t t t t ts	   = A