
    -i                         d Z ddlZddlmZ ddlmZ ddlZddl	m
Z ddlZddlmZ ddlmZ ddlmZ ddlmZ  G d d	          Zd
 Zedk    r e             dS dS )z
Arabic noun stemmer
    N)arepr   )print_table)custom_dictionary)wordcasec                      e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
d
 Zd Zed             Zedd            Zed             Zed             Zd Zed             Zed             Zed             Zed             ZdS )NounStemmerz
    Arabic noun stemmer
    Fc                    t           j                                        | _        | j                            t
          j                   | j                            t
          j                   t           j                                        | _	        | j	                            t
          j
                   | j	                            t
          j                   t          j                                        | _        d| _        t!          j        d          | _        t'          j        d          | _        i | _        i | _        i | _        i | _        || _        d| _        d S )NTnouns )
tashaphynestemmingArabicLightStemmercomp_stemmerset_prefix_listSNCCOMP_PREFIX_LISTset_suffix_listCOMP_SUFFIX_LISTconj_stemmerCONJ_PREFIX_LISTCONJ_SUFFIX_LISTalyahmornoun_affixer	generatorallow_syntax_lastmarkarabicdictionaryArabicDictionarynoun_dictionaryr   custom_noun_dictionarycache_dict_searchcache_affixes_verification
noun_cachenoun_vocalize_cachedebug
error_codeselfr%   s     O/var/www/html/speakWrite/venv/lib/python3.11/site-packages/qalsadi/stem_noun.py__init__zNounStemmer.__init__"   s   &/BBDD))#*>???))#*>???&/BBDD))#*>???))#*>??? ".;;==%)"  0@II&7&I'&R&R# "$*,'#% 
    c                     | j         S )z?
        Return error code when word is not recognized
        r&   r(   s    r)   get_error_codezNounStemmer.get_error_code@   s     r+   c                 &    | j         s	|| _         dS dS )z<
        set error code when word is not recognized
        Nr-   )r(   r&   s     r)   set_error_codezNounStemmer.set_error_codeH   s#      	)(DOOO	) 	)r+   c                     || j         v r| j         |         S | j                            |          }|| j                            |          z  }|| j         |<   |S )z)
        lookup for word in dict
        )r#   r   lookupr    )r(   wordresults      r)   lookup_dictzNounStemmer.lookup_dictO   s`     4?""?4(()0066Fd188>>>F$*DOD!r+   c                 |   |                      d           |s|                      d           dS | j        }g }|g|                     |          z   }t          t	          |                    }g }|D ]}| j                            |          }|                     ||t          j	                  }|D ]s}|d|d                  }	||d         |d                  }
||d         d         }|
g| 
                    |
|          z   }|D ]}
||
|	|d}|                    |           t|s|                      d           g }|rt          d           |r8t          t          |                     t          t          |                     |D ]X}| j                            |d	                   }|                     |d	         |t          j                  }|D ]
}|d	         d|d                  }|d	         |d         d         }t#          j        |          }|                     ||          }|D ]}
|                                }|
|d
<   ||d<   t          t	          t          j        |d                  d         t          j        |d                  d         z   t          j        |d                  d         z                       |d<   |                    |           Z|rt          d           |r8t          t          |                     t          t          |                     |s|                      d           |}g }|D ]}|d
         }|| j        v r| j        |         }n|                     |          }|| j        |<   |D ]H}|                                }|d         |d<   t5          |          |d<   |                    |           I|rt          d           |rDt          t          |                     d |D             }t          t          |                     |s|                      d           |}g }|D ]>}|                     |d         |d         |d         |d         |d                   rt          j        |d                  d         D ]}t          j        |d                  d         D ]}t          j        |d                  d         D ]}|                     |d         |||          rt          j        |         d         t          j        |         d         z   t          j        |         d         z   }|                                }||d<   ||d<   ||d<   |                    |           Č@|rt          d           |rDt          t          |                     d |D             }t          t          |                     |s|                      d           |}g }|D ]}|                     |d         d         |d         |d         |d                   }|D ]\  } }!}"g }#|d         d         dk    r|#                    d           d t          j        |d                  v rt          j        |d                  d          }$nd!}$|$D ]G}%|d         |%fz   }&d" |&D             }&|d         d#         }'|'d$v r.|d         d%         }(|(r|d         d%         })n|d         d         })n|d         d         })|                    t=          j        ||d         d|d         |d         f|d
         t#          j        |d                              d&d                    |d         d         |)| |!d'!                    |&          d'!                    d(|d         d)         g          |'|d         d*         d+d'!                    |#          dd,                     I|s|                      d-           |rt          d.           |rt          tE          |                     |S )/z
        Analyze word morphologically as noun
        @param noun_in: the input noun.
        @type noun_in: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   z
Empty wordNr   r   )noun	stem_compproencz First level segmentation errorzafter first levelr9   	stem_conjsuffixr:   tagsr;   
affix_tagszafter second levelz  Second level segmentation error	vocalizedoriginal
noun_tuplezafter lookup dictc                     g | ]
}|d          S rB    .0items     r)   
<listcomp>z-NounStemmer.stemming_noun.<locals>.<listcomp>       CCC$4-CCCr+   zNot exists in dictionarysuf_vocenc_voczafter check compatibilityc                     g | ]
}|d          S rD   rE   rF   s     r)   rI   z-NounStemmer.stemming_noun.<locals>.<listcomp>  rJ   r+   zAffixes not compatiblemankousTku
   منقوصcases)r   c                     g | ]}||S rE   rE   )rG   vacs     r)   rI   z-NounStemmer.stemming_noun.<locals>.<listcomp>-  s    %K%K%Kcs%Kc%K%K%Kr+   number   جمعu   جمع تكسيرsingleroot:Nounwordtypegenderfreqnoun)r4   affixstemrW   rA   lemmar@   semivocalizedr>   typerS   r[   freqoriginaltagssyntaxzForms are not generatedzafter generate result)#r1   r%   get_noun_variantslistsetr   segmentverify_affixr   COMP_NOUN_AFFIXESget_input_stem_variantsappendprintr   r   r   NOMINAL_CONJUGATION_AFFIXarnormalize_hamzaget_stem_variantscopyCOMP_PREFIX_LIST_TAGSCOMP_SUFFIX_LIST_TAGSCONJ_SUFFIX_LIST_TAGSr!   r6   dictvalidate_tags _NounStemmer__check_clitic_affixvocalizer   WordCasegetjoinlen)*r(   noun_inr%   detailed_result	noun_listword_segmented_listr8   list_seg_compsegproclitic_nmr^   enclitic_nm	list_stemword_segtmp_listlist_seg_conjseg_conjr<   r=   stem_conj_listword_seg_l2inf_nouninfnoun_foundlistrB   word_seg_l3noun_tuplespro_vocrL   rK   affix_tags_vocword_seg_l4voca_tuple_listr@   semi_vocalized_original_tags
list_casescasevoc_affix_caserS   single_formr_   s*                                             r)   stemming_nounzNounStemmer.stemming_noun[   s	    	B 	---4

""7++,	 Y((	  	9 	9D -55d;;M --mS%: M % 9 9#Hc!fH~CFSVO,"3q688n 00{CCD	
 & 9 9D $%)+*	   H (..x888899$ # 	C ABBB  	'%&&& 	4 %..!!!+122333+ $	1 $	1H !-55h{6KLLM !--%}c6S M
 * 1 1$[1-HQK-@	!+.x{}}=.y99	!%!7!7	6!J!J + 1 1D"*--//K/3K,,2K)045k%6HI&Q!7E8JKFST!7H8MNvVW 1 1K- OOK000011.  	(&''' 	)%..!!!+h''((( 	D BCCC&+ 	- 	-H  ,H 4111$($:8$D!!$($4$4X$>$>!3D&x0/ - -
&mmoo*4[*AJ',0,<,<L),,,,	-  	'%&&& 	,%..!!!CC(CCCK+k**+++  	< :;;;&+ "	= "	=H
 !!&&"  =  #8%I+V = =G#&#<Xe_#M#$ = = (+'@(AS'T'( = =G  $88 ( 6'    =
 %($=g$Fv$N&)&?&H&P%Q&)&?&H&P%Q !/
 /7mmoo9@I 69@I 6<JL 9 ( < < <#==,  	/-... 	,%..!!!CC(CCCK+k**+++ 	: 8999&+ @	 @	H #mm&{3##	 O 1@ 7 7,	>1 !#L))4<<!((666c78KLLL!$!:8I;N!OPW!XJJ!&J& , ,D%-l%;tg%EN%K%K^%K%K%KN%l3H=F!@@@&.|&<X&F& H$,\$:8$DEE$,\$:;$GEE ( 6{ C#** )(/$,UO$&$,Y$7$,Y$7	*" )1(=(*(:$,\$:$>$>vr$J$J)" )" -5\,B;,O(--61?(+(@(@(+%+Xl-CJ-O$P)" )" +1*2<*@*J(2030G0G*,1     ,7p  	; 9::: 	+)*** 	(#o&&'''r+   c                    |t           j        k    rY|d                             t           j                  s4|d                             t           j        t           j        z             sdS dt          j        |         d         v r
|d         sdS dt          j        |         d         v r
|d         rdS d                    |||t          t          |d                             g          }|| j        v r| j        |         S t          j        |         d         }t          j        |         d         }t          j        |         d         }|t          j        |                             d	d
          z  }d|v rd|v rd|vrd| j        |<   nid|v rd|v rd| j        |<   nVd|v rd|v rd| j        |<   nCd|v rd|v rd| j        |<   n0d|v rd|v rd| j        |<   nd|v rd|vrd| j        |<   n
d| j        |<   | j        |         S )a  
        Verify if proaffixes (sytaxic affixes) are compatable
        with affixes ( conjugation)
        @param proclitic_nm: first level prefix.
        @type proclitic_nm: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @return: compatible.
        @rtype: True/False.
        unvocalizedF   جمع مذكر سالمr>   masculin_plural
   تنوينmamnou3_sarf-rP   rE   u
   تعريفu   مضافu
   إضافةu   لايضافu   جرu
   مجرورT)ro   FATHATANendswithTEH_MARBUTAALEFHAMZAr   ru   r|   strboolr"   rs   rt   r{   )	r(   rB   r   encliticr=   r]   proclitic_tagsenclitic_tagssuffix_tagss	            r)   __check_clitic_affixz NounStemmer.__check_clitic_affixa  sq   " R[  }%..r~>> !-(11"'BH2DEE ! 5 '#*CF*KF*SSS01 T 5
 C5f=fEEE>* F 5 8VSj6P1Q1Q-R-RS
 
 D3332599 2<@H1(;FC/7? 	s08<<WbIIIN**k))-//5:D+E22^++0K0K5:D+E22^++0K0K5:D+E22
 =((\[-H-H5:D+E22=((^{-J-J5:D+E22~%%,k*I*I5:D+E22 6:D+E2.u55r+   c                     || _         dS )z
        Set the debug attribute to allow printing internal analysis results.
        @param debug: the debug value.
        @type debug: True/False.
        N)r%   r'   s     r)   	set_debugzNounStemmer.set_debug  s     


r+   c                     d| _         dS )zX
        Enable the syntaxic last mark attribute to allow use of I'rab harakat.
        TNr   r.   s    r)   enable_syntax_lastmarkz"NounStemmer.enable_syntax_lastmark  s     &*"""r+   c                     d| _         dS )zY
        Disable the syntaxic last mark attribute to allow use of I'rab harakat.
        FNr   r.   s    r)   disable_syntax_lastmarkz#NounStemmer.disable_syntax_lastmark  s     &+"""r+   c                    t          | g          }|t          j        t          j        z   t          j        t          j        z   t          j        t          j        t          j        z   t          j        z   fv r$| t          j        z   }|                    |           |t          j        t          j        z   t          j        t          j        z   fv rK|                     t          j                  r,| dd         t          j        z   }|                    |           |r4|t          j        t          j        z   t          j	        t          j        z   fv r$| t          j        z   }|                    |           |                     t          j                  r,| dd         t          j
        z   }|                    |           |                     t          j                  r,| dd         t          j        z   }|                    |           |}|S )u  
        Generate the Noun stem variants according to the affixes.
        For example مدرستي = >مدرست+ي = > مدرسة +ي.
        Return a list of possible cases.
        @param stem: the input stem.
        @type stem: unicode.
        @param suffix_nm: suffix (no mark).
        @type suffix_nm: unicode.
        @return: list of stem variants.
        @rtype: list of unicode.
        N)rg   ro   r   TEHYEHr   addNOONr   WAWALEF_MAKSURAr   	YEH_HAMZA)r^   	suffix_nmpossible_noun_listpossible_nounvalidated_lists        r)   rq   zNounStemmer.get_stem_variants  s   " !
 

 GbfFR^#FFRWrv%	
 
 
 !2>1M""=11127*BFRW,<===$--PRPVBWBW= "I6M""=111 	2I"&27*:BFRW<L)MMM 26MM""=111==   	2 "I7M""=111=="" 	2 "I4M""=111 ,r+   c                    |}|}|                     t          j                  dk    r-|r+t          j        t          j        t          j        |          }nI|sGt          j        |          r3| dd         t          j        t          j        fv rd}n|rt          j	        }dt          j        |         d         v rt          j        |          }n|}||fS )u  
        Get the suffix variant to be joined to the word.
        For example: word = مدرس, suffix = ة, enclitic = ي.
        The suffix is converted to Teh.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @param mankous: if the noun is mankous ends with Yeh منقوص.
        @type mankous: boolean.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        r   r   Nr   u
   متحركr>   )findro   r   resubr   	is_harakar   r   KASRATANr   ru   strip_lastharaka)r4   r=   r   rN   r   	newsuffixsuffix_non_irab_marks          r)   get_suffix_variantszNounStemmer.get_suffix_variants  s    & 	;;r~&&!+++r~rvv>>II 	(f!5!5 	(BCCyRVRW---		 ( K	 34V<VDDD#%#6y#A#A  #, ...r+   c                    | }|                      t          j        t          j        z             r|                    t          j                  r{|                     t          j        t          j        z   t          j                  }|                     t          j        t          j        z   t          j        t          j        z             } | |fS )u  
        Get the enclitix variant to be joined to the word.
        For example: word = كتاب, suffix = كسرة, enclitic = هم.
        The enclitic has a second form هِم.
        @param enclitic_voc: first level suffix vocalized.
        @type enclitic_voc: unicode.
        @param suffix_voc: second level suffix vocalized.
        @type suffix_voc: unicode.
        @return: variant of enclitic  (vocalized enclitic and vocalized
        enclitic without I'rab short mark).
        @rtype: (unicode, unicode)
        )
startswithro   HEHDAMMAr   KASRAreplace)enclitic_voc
suffix_vocencl_vo_no_inflect_marks      r)   get_enclitic_variantz NounStemmer.get_enclitic_variant*  s     #/""26BH#455 	V*:M:Mbh:W:W 	V&2&:&:26BH;Lbf&U&U#'//0A26BHCTUUL444r+   c                 J   | }t          j        |          }t          j        |          }||z   }t          j        |          }|                    t           j                  r|                    t           j        t           j        z             r|t           j        t           j        t           j        z   t           j        t           j        z   t           j        z   fv r|dd         t           j        z   }n.|t           j        t           j        z   k    rt          j        |dd                   }n|dk    r|dd         t           j        z   }n|t           j        t           j        z   t           j        t           j        z   t           j        t           j        t           j        z   t           j        z   fv rt          j        |dd                   }nR|dk    r|dd         t           j        z   }n3|                    t           j                  r>|dk    r|dd         t           j        z   }n|dk    r|dd         t           j        z   }n|                    t           j	        t           j        z             r|d|vr&d|vr"|s |st          j        |dd                   }n|t           j        t           j
        z   t           j        t           j
        z   fv rt          j        |dd                   }n.|                    t           j                  r|s|r	|                    t           j                  r|dd         t           j        z   }n|                    t           j	                  r|dd         t           j        z   }n|                    t           j        t           j        z             s9|                    t           j        t           j        z   t           j        z             r6|                    t           j                  r|dd         t           j        z   }|S )u  
        Get the word variant to be joined to the suffix.
        For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
        @param word: word found in dictionary.
        @type word: unicode.
        @param proclitic: proclitic ( first level).
        @type proclitic: unicode.
        @param suffix: suffix ( first level).
        @type suffix: unicode.
        @param enclitic: enclitic( second level).
        @type enclitic: unicode.
        @return: variant of word.
        @rtype: unicode.
        Nr   r      الu   لل)ro   strip_tashkeelr   r   r   r   r   r   r   r   r   r   r   r   r   	WAW_HAMZAr   SUKUNr   )r4   	procliticr=   r   	word_stemr   r   long_suffix_nms           r)   get_word_variantzNounStemmer.get_word_variant@  s     	%f--	'11"[0'	22	 bn-- @	:!!"'BN":;; 4FFR^+FRW$rv-!  
 !*#2# 7II"'BF"222 " 3IcrcN C CII#r)) )#2# 7I"& ' 26)	   /	#2#??		  2%%%crcNRV3	00 $	: B%crcNRV3		""%crcNRW4	26 122 	:
 i'')++# ,! , /	#2#??		 rv/"'1ABBB/	#2#??	 )) 		:y 		:K 		:  ** :%crcNR\9		""28,, :%crcNR\9		""26BH#455:%%bfrx&7"(&BCC: ##BK00: &crcNR\9	r+   c                     d                     ||||g          }|| j        vr%| j                            ||||          | j        |<   | j        |         S )a  
        Join the  noun and its affixes, and get the vocalized form
        @param noun: noun found in dictionary.
        @type noun: unicode.
        @param proclitic: first level prefix.
        @type proclitic: unicode.

        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: vocalized word.
        @rtype: unicode.
        r   )r|   r$   r   ry   r   rs   rt   r   ro   r   r   r   FATHAis_sunSHADDAr   r   r   r   r   r   r   r   r   r   r   AJUST_VOCAL_PATTERNS)r(   r8   r   r=   r   keyproclitic_vocr   encl_voc_non_inflectr   rN   r   word_non_irab_markword_vocalized	segmentedpatrnrepls                    r)   ry   zNounStemmer.vocalize  sd     hhi:;;d...,0N,C,Ci- -D$S) ',,r+   c                    g }t           j        | v r|                    |                     t           j        t           j        dz                       |                    |                     t           j        t           j        t           j        z                        |                    |                     t           j        t           j        t           j        z                        |S )zgenerate noun varaintes   )ro   
ALEF_MADDArl   r   ALEF_HAMZA_ABOVEr   r   )r8   r   s     r)   re   zNounStemmer.get_noun_variants  s     	=D  T\\"-9Lq9PQQRRRT\\"-BG9KLLMMMT\\"-9Lrw9VWWXXXr+   c                 z   g }|r|                      t          j                  r+|                    | dd         t          j        z              nl|                      t          j                  r*|                    | dd         t          j        z              n"|                    | t          j        z              |S )zgenerate stem varaintesNr   )r   ro   r   rl   r   r   r   r   )r^   r   r   s      r)   rk   z#NounStemmer.get_input_stem_variants  s     	 	,}}RW%% =  crcR_!<====rv&& =  crcR^!;<<< TBF]+++r+   c                 $      fd|D             S )ad  
        Verify possible affixes in the resulted segments according
        to the given affixes list.
        @param word: the input word.
        @type word: unicode.
        @param list_seg: list of word segments indexes (numbers).
        @type list_seg: list of pairs.
        @return: list of acceped segments.
        @rtype: list of pairs.
        c                 ~    g | ]9}d                      d|d                  |d         d         g          v 7|:S )r   Nr   r   )r|   )rG   s
affix_listr4   s     r)   rI   z,NounStemmer.verify_affix.<locals>.<listcomp>+  sS     
 
 
388T&AaD&\4!<,H#I#IZ#W#WA#W#W#Wr+   rE   )r4   list_segr   s   ` `r)   ri   zNounStemmer.verify_affix  s4    
 
 
 
 

 
 
 	
r+   c                 `   |}d|v r
| d         rdS | d         dv rd|v rdS d|v rdS d|v rdS |d	v r
| d
         sdS |dv r
| d         sdS d|vr
| d         sdS |                     d          r
| d         sdS |t          j        k    r
| d         sdS |t          j        k    r| d         dk    rdS dS )a~  
        Test if the given word from dictionary is compabilbe with affixes tags.
        @param noun_tuple: the input word attributes given from dictionary.
        @type noun_tuple: dict.
        @param affix_tags: a list of tags given by affixes.
        @type affix_tags:list.
        @param proclitic_nm: first level prefix vocalized.
        @type proclitic_nm: unicode.
        @param enclitic_nm: first level suffix vocalized.
        @type enclitic_nm: unicode.
        @param suffix_nm: first level suffix vocalized.
        @type suffix_nm: unicode.
        @return: if the tags are compatible.
        @rtype: Boolean.
        r   r   FrS   rT   u   جمع مؤنث سالمr   u   مثنى)u   همu   هنu   كماu   كمu   هما	hm_suffix)u   هu   ها	ha_suffixr   k_prefix
kal_prefixw_suffixrU   T)r   ro   r   r   )rB   r?   r   r   r   r   s         r)   rw   zNounStemmer.validate_tags/  s*   $ !	 :%%*^*D%5h#BBB)Z77u)Z77uZ''u GGG{+ H 5.((K1H(5"":j+A"5f%% 	j.F 	5 z*'=5&&:h+?8+K+K5( tr+   N)F)__name__
__module____qualname____doc__r*   r/   r1   r6   r   rx   r   r   r   staticmethodrq   r   r   r   ry   re   rk   ri   rw   rE   r+   r)   r	   r	      s           <  ) ) )
 
 
D D DLV6 V6 V6p  * * *+ + + 1 1 \1f &/ &/ &/ \&/P 5 5 \5* Z Z \Zxb= b= b=H   \   \$ 
 
 \
 G G \G G Gr+   r	   c                  P   g d} t                      }|                    d           | D ]B}|j                            |           t	          |j                                                   C| D ]}|                    |          }|D ]}|j                                        D ]d}t	          d	                    |t          |j        |                                       d          g                                        d           et	                       t	                       dS )z
    Test main)u   يضربu   الكتابu   الاستخدامu   فاستعمالهمu   ضربu   لأكلهمT	zunicode-escapeutf8N)r	   r   r   rh   rm   get_affix_listr   __dict__keysr|   reprdecodeencode)wordlistnounstemmerr4   r5   analyzedr   s         r)   mainlyr  z  sB     H --K$ 9 9 ((...k&55778888  **400 	 	H  (--// ! !		 $x'8'=">">"E"EFV"W"WX   &....GGGGGGG	 r+   __main__)r  r   pyarabic.arabyarabyro   pyarabic.arabreprr   tashaphyne.stemmingr   alyahmor.aly_stem_noun_constaly_stem_noun_constr   alyahmor.noun_affixerr   arramooz.arabicdictionaryr   print_debugr   r   r   r   r	   r  r   rE   r+   r)   <module>r     s    
			       # # # # # #     * * * * * *     4 4 4 4 4 4 $ $ $ $ $ $            Z Z Z Z Z Z Z Zz  F z
FHHHHH r+   