
    -i}g                     p    d Z ddlmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ  G d d          ZdS )	z
Arabic stop stemmer
    N   )stem_stopwords_const)wordcase)stopword_affixerc                       e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
ed
             Zed             Zed             Zed             Zd Zed             Zed             Zed             Zed             ZdS )StopWordStemmerz
    Arabic stop stemmer
    Fc                 L   t           j                                        | _        | j                            t
          j                   | j                            t
          j                   t           j                                        | _	        | j	                            t
          j
                   | j	                            t
          j                   t          j                    | _        d| _        t!          j        d          | _        i | _        i | _        || _        d S )NTclassedstopwords)
tashaphynestemmingArabicLightStemmercomp_stemmerset_prefix_listssconstCOMP_PREFIX_LISTset_suffix_listCOMP_SUFFIX_LISTconj_stemmerCONJ_PREFIX_LISTCONJ_SUFFIX_LISTstop_affixerr   	generatorallow_syntax_lastmarkstopwordsdictionaryclassStopWordsDictionarystop_dictionarycache_dict_searchcache_affixes_verificationdebugselfr   s     O/var/www/html/speakWrite/venv/lib/python3.11/site-packages/qalsadi/stem_stop.py__init__zStopWordStemmer.__init__#   s    &/BBDD))'*BCCC))'*BCCC&/BBDD))'*BCCC))'*BCCC &688 &*"  8K 
  

 "$*,'


    c                    g }|g}t           j        |v r@|                    |                    t           j        t           j        dz                       |D ]}| j                            |          }|                     ||t          j	                  }|D ]}|d|d                  }||d         |d                  }||d         d         }|g}	|r`|
                    t           j                  r+|	                    |dd         t           j        z              n|
                    t           j                  r+|	                    |dd         t           j        z              nI|
                    t           j                  r*|	                    |dd         t           j        z              |                    t           j                  r#|	                    |t           j        z              nA|                    t           j                  r"|	                    |t           j        z              |	D ]-}|                    |                     ||||                     .ѐ|S )z
        Analyze word morphologically as stop
        @param stop: the input stop.
        @type stop: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
           Nr   r   )araby
ALEF_MADDAappendreplaceALEF_HAMZA_ABOVEr   segmentverify_affixr   COMP_STOPWORDS_AFFIXESendswithALEFALEF_MAKSURAYEHTEHTEH_MARBUTA
startswithNOONextendsteming_second_level)
r!   stopdetailed_result	stop_listlist_seg_compseg	procleticstemencletic_nm	list_stems
             r"   stemming_stopwordz!StopWordStemmer.stemming_stopword@   sf    
	 t##T\\%*:E<RUV<VWWXXX 	 	D -55d;;M --mW%C M
 %   3q6N	CFSVO,"3q688n "F	 <}}UZ00 H!((crcU5G)GHHHHuy11 H!((crcU5G)GHHHHuy11 H!((crcU5F)FGGG"--ei88 <!((	)9::::$//
;; <!((
):;;; &  D#**11$iUU   +2 r$   c                    g }| j                             |          }|                     ||t          j                  }|D ]}}||d         |d                  }||d         d         }	|                     ||	          }
g }t          |
          D ]e}|| j        vr8| j        	                    |          }| 
                    |          | j        |<   n| j        |         }|                    |           f|D ]}|d         }t          j        |         d         D ]}t          j        |	         d         D ]}t          j        |         d         t          j        |         d         z   t          j        |         d         z   }|                     ||||          }|                     ||||          }|r|r|                     ||||          }|d         d         |d         d         }}|                     |          }|d         dk    rdnd}d	                    |d
         |d         ||d         g          }|                    t+          j        ||d||f|||||d	                    |          d	                    d|d
         g          d||d         |d         |d         dd                     Ő|S )aC  
        Analyze word morphologically by stemming the conjugation affixes.
        @param stop: the input stop.
        @type stop: unicode.
        @param stop2: the stop stemed from syntaxic affixes.
        @type stop2: unicode.
        @param procletic: the syntaxic prefixe extracted in the fisrt stage.
        @type procletic: unicode.
        @param encletic_nm: the syntaxic suffixe extracted in the
        first stage (not vocalized).
        @type encletic_nm: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   r   N	vocalizedtagsis_inflectedu   مبنيu   معرب:	word_type
word_classaction STOPWORDfreqstopwordobject_typeneed)wordaffixr@   originallemmarE   semivocalizedrF   typefreqoriginaltagsrK   rO   rP   syntax)r   r-   r.   r   STOPWORDS_CONJUGATION_AFFIXget_stem_variantssetr   r   lookupcreate_dict_wordr8   COMP_SUFFIX_LIST_TAGSCONJ_SUFFIX_LIST_TAGSCOMP_PREFIX_LIST_TAGSvalidate_tagsis_compatible_proaffix_affixvocalizeajust_vocalizationjoinr*   r   WordCase)r!   r:   stop2r?   rA   r;   list_seg_conjseg_conj	stem_conjsuffix_conj_nmpossible_stop_listinfstop_form_listinfstopinfstop_foundlist
stop_tuplerS   vocalized_encleticvocalized_suffix
affix_tagsvalid
compatiblevocalized_listrE   semi_vocalizedrG   original_tagss                             r"   r9   z$StopWordStemmer.steming_second_levels   s    )11%88))='"E
 
 & n	 n	HhqkHQK78I"8A;==1N "&!7!7	>!R!R !#122 < < $"888(,(<(C(CG(L(L%6:6K6K)7 7D*733 )-(>w(G%!(():;;;;/ T T
%k2 +2*G*T+ K K& -4,I&-!-# H H(
 $9)DVL%;<NOPVWX%;<LMfUV # !% 2 2&
I{! ! &*%F%F&	3EGW& &
 ! 8Z 8-1]] ( ) 0 2	. .N !/q 1! 4 .q 1! 4 (6I )-(?(?	(J(JI /9.HA.M.M

S] )
 -0HH$.{$;$.|$<$0$.x$8	!"- -M ,22 ( 104,5,.,<,>	2* 1:4<195>9G030D0D03-7K9P,Q1* 1* 1?8E2<X2F7A-7P0:60B24-%& %&!" !"  [HKTj r$   c                    d                     |||g          }|| j        v r| j        |         S t          j        |         d         }t          j        |         d         }t          j        |         d         }d|v rd|v rd| j        |<   nd|v rd|v rd| j        |<   nd|v rd|v rd| j        |<   nd|v rd|v rd| j        |<   nt          d|v           rt          d	|v           rd| j        |<   njt          d	|v           r"t          d
|d         v           sd| j        |<   n7t          d	|v           rt          d	|v           sd| j        |<   n
d| j        |<   | j        |         S )a  
        Verify if proaffixes (sytaxic affixes) are compatable
        with affixes ( conjugation)
        @param procletic: first level prefix.
        @type procletic: unicode.
        @param encletic: first level suffix.
        @type encletic: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @return: compatible.
        @rtype: True/False.
        -rF   
   تعريف   مضافF
   تنوينu   لايضافu
   لايجر
   مجرور   جرrJ   T)rf   r   r   ra   r_   r`   bool)	r!   rq   r?   encleticsuffixrR   procletic_tagsencletic_tagssuffix_tagss	            r"   rc   z,StopWordStemmer.is_compatible_proaffix_affix   s    )Xv677D3332599 !6yA&I5h?G3F;FC>))jK.G.G5:D+E22^++0K0K5:D+E22 =((\[-H-H5:D+E22=((^{-J-J5:D+E22 ,-/00 
	:T,+:U5V5V 
	:5:D+E22,-/00 	:j..:
 :
 	: 6;D+E22,-/00 	:lk>Y9Z9Z 	:5:D+E22 6:D+E2.u55r$   c                     || _         dS )z
        Set the debug attribute to allow printing internal analysis results.
        @param debug: the debug value.
        @type debug: True/False.
        N)r   r    s     r"   	set_debugzStopWordStemmer.set_debug4  s     


r$   c                     d| _         dS )zX
        Enable the syntaxic last mark attribute to allow use of I'rab harakat.
        TNr   r!   s    r"   enable_syntax_lastmarkz&StopWordStemmer.enable_syntax_lastmark<  s     &*"""r$   c                     d| _         dS )zY
        Disable the syntaxic last mark attribute to allow use of I'rab harakat.
        FNr   r   s    r"   disable_syntax_lastmarkz'StopWordStemmer.disable_syntax_lastmarkB  s     &+"""r$   c                 t   t          | g          }|r4|t          j        t          j        z   t          j        t          j        z   fv r$| t          j        z   }|                    |           |                     t          j                  r,| dd         t          j        z   }|                    |           |}|S )u  
        Generate the Stop stem variants according to the affixes.
        For example مدرستي = >مدرست+ي = > مدرسة +ي.
        Return a list of possible cases.
        @param stem: the input stem.
        @type stem: unicode.
        @param suffix_nm: suffix (no mark).
        @type suffix_nm: unicode.
        @return: list of stem variants.
        @rtype: list of unicode.
        Nr'   )r\   r(   r3   r7   WAWaddr0   r2   )r@   	suffix_nmrm   possible_stopvalidated_lists        r"   r[   z!StopWordStemmer.get_stem_variantsH  s    " !
 

  	2II
"I
"*
 
 
 !59,M""=111==## 	2 "I(::M""=111+r$   c                 "   t          j        |          }|}|sC| dd         t           j        t           j        t           j        fv rt          j        |          rd}dt          j        |         d         v rt          j        |          }n|}||fS )u$  
        Get the suffix variant to be joined to the word.
        For example: word = مدرس, suffix = ة, encletic = ي.
        The suffix is converted to Teh.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        r'   NrL   
   متحركrF   )	r(   strip_tashkeelr2   r3   r1   	is_harakar   r`   strip_lastharaka)rQ   r   encliticenclitic_nm	newsuffixsuffix_non_irab_marks         r"   get_suffix_variantsz#StopWordStemmer.get_suffix_variantsk  s      *844	 	RSS	e0%)UZHHH'' I I
 78@HHH#(#9)#D#D  #, ...r$   c                 t   |                     t          j                  r6|                     t          j                  r|dd         t          j        z   }|                     t          j        t          j        z             r6|                     t          j                  r|dd         t          j        z   }|S )u  
        Get the enclitic variant to be joined to the word.
        For example: word = عن, suffix = , encletic = ني.
        The word and enclitic are geminated.
        @param word: word found in dictionary.
        @type word: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        r   N)r6   r(   r7   r0   SHADDAKASRAr3   )rQ   r   s     r"   get_enclitic_variantsz%StopWordStemmer.get_enclitic_variants  s    $ uz** 	3t}}UZ/H/H 	3|el2Hu{UY677 	3DMM%)<T<T 	3|el2H r$   c                    | }t          j        |          }|                    t           j                  rE|rC|dk    r|dd         t           j        z   }n|dd         t           j        z   t           j        z   }n|                    t           j                  ro|rm|                    t           j	                  r|dd         t           j
        z   }n6|                    t           j                  r|dd         t           j        z   }|r(|d         t           j        v rt          j        |          }|                    t           j                  rA|                     t           j        t           j        z             rt          j        |          }nl|                    t           j        t           j        z             r@|                     t           j        t           j        z             rt          j        |          }|S )uz  
        Get the word variant to be joined to the suffix.
        For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: suffix ( firts or second level).
        @type suffix: unicode.
        @return: variant of word.
        @rtype: unicode.
        u
   سِوَىNr'   r   )r(   r   r0   r2   r1   r3   SUKUNHAMZAr6   DAMMA	WAW_HAMZAr   	YEH_HAMZAHARAKATr   r7   )rQ   r   	word_stemr   s       r"   get_word_variantz StopWordStemmer.get_word_variant  s    	(00	 e011 
	=i 
	=L((%crcNUZ7		%crcNUY6D		,, 	= 	=  -- =%crcNU_<		""5;// =%crcNU_<	  	:fQi5=00.y99I UZ(( 	:T]]5:;S-T-T 	:.y99IIu{UY677 	:DMMI#=
 =
 	: .y99Ir$   c                 <    | j                             ||||          S )a  
        Join the  stop and its affixes, and get the vocalized form
        @param stop: stop found in dictionary.
        @type stop: unicode.
        @param proclitic: first level prefix.
        @type proclitic: unicode.

        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: vocalized word.
        @rtype: unicode.
        )	r   rd   r   ra   r_   r   r   r   rf   )r!   r:   	procliticr   r   proclitic_vocenclitic_voc
suffix_vocr   word_non_irab_markword_vocalizeds              r"   rd   zStopWordStemmer.vocalize  s     " ~&&tYIIIr$   c                 $      fd|D             S )ad  
        Verify possible affixes in the resulted segments according
        to the given affixes list.
        @param word: the input word.
        @type word: unicode.
        @param list_seg: list of word segments indexes (numbers).
        @type list_seg: list of pairs.
        @return: list of acceped segments.
        @rtype: list of pairs.
        c                 ~    g | ]9}d                      d|d                  |d         d         g          v 7|:S )r{   Nr   r   )rf   ).0s
affix_listrQ   s     r"   
<listcomp>z0StopWordStemmer.verify_affix.<locals>.<listcomp>  sS     
 
 
388T&AaD&\4!<,H#I#IZ#W#WA#W#W#Wr$    )rQ   list_segr   s   ` `r"   r.   zStopWordStemmer.verify_affix  s4    
 
 
 
 

 
 
 	
r$   c                    t          j        |          }d|v r
| d         sdS d|v r
| d         rdS d|v r| d         rd|vrdS d|v r
| d         sdS d	|v r
| d         sdS d
|v r
| d         sdS d
|v r
| d         rdS |t           j        k    r
| d         sdS d|v r/| d         s%| d                             t           j                  rdS d|v r
| d         sdS d|v r
| d         sdS d|v r
| d         sdS dS )a  
        Test if the given word from dictionary is compabilbe with affixes tags.
        @param stop_tuple: the input word attributes given from dictionary.
        @type stop_tuple: dict.
        @param affix_tags: a list of tags given by affixes.
        @type affix_tags:list.
        @param procletic: first level prefix vocalized.
        @type procletic: unicode.
        @param encletic_nm: first level suffix vocalized.
        @type encletic_nm: unicode.
        @return: if the tags are compaatible.
        @rtype: Boolean.
        r|   
definitionFdefinedr   rG   r   prepositionr   r}   pronounu
   وقايةrQ   u   استفهامinterrogu   قسمqasamr~   tanwinT)r(   r   r3   r0   )rq   rt   r?   rA   s       r"   rb   zStopWordStemmer.validate_tags  s    (33	 :%%j.F%5:%%*Y*?%5 j  >* ! J..5Z
=(A5:%%j.H%5##Jy,A#5##
9(=#5%)##J~,F#5 :%%~& &*4V*<*E*Eei*P*P & 5z))*Z2H)5 z!!*W*=!5
 :%%j.B%5 tr$   c                     | S )a  
        Create a list of dictWord objects from dictionary entries
        @param dict_entries_list: a list of entiers from lexicon
        @type  dict_entries_list: list of dict
        @return: a list of dictWord object
        @rtype: a list of dictWord object
        r   )dict_entries_lists    r"   r^   z StopWordStemmer.create_dict_word_  s
     ! r$   c                 F    t           j                            | |           }|S )z
        ajust vocalization
        Temporary function
        @param vocalized: vocalized word.
        @type vocalized: unicode.
        @return: ajusted vocalized word.
        @rtype: unicode.
        )r   	AJUSTMENTget)rE   ajusteds     r"   re   z"StopWordStemmer.ajust_vocalizationj  s!     #''	9==r$   N)F)__name__
__module____qualname____doc__r#   rC   r9   rc   r   r   r   staticmethodr[   r   r   r   rd   r.   rb   r^   re   r   r$   r"   r   r      s           :1 1 1fH H HT56 56 56n  * * *+ + +     \ D !/ !/ \!/F   \4 ( ( \(T52 52 52n 
 
 \
 @ @ \@D ! ! \!   \  r$   r   )r   pyarabic.arabyr(   tashaphyne.stemmingr   tashaphyne.normalizerL   r   r   !arramooz.stopwordsdictionaryclassr   r   alyahmorr   r   r   r   r$   r"   <module>r      s                   - - - - - - D D D D D D       5 5 5 5 5 5
X	 X	 X	 X	 X	 X	 X	 X	 X	 X	r$   