
    -i2                     X    d Z ddlmZ ddlZddlZddlmZ ddlm	Z
  G d d          ZdS )z
Arabic stop stemmer
    N   )stem_stopwords_constc                       e Zd ZdZd Zed             Zd Zed             Zed             Z	d Z
ed             Zed	             Zd
 ZdS )stopword_affixerz
    Arabic stop stemmer
    c                     t           j        | _        g | _        t           j        | _        t           j        | _        t           j        | _	        dS ) N)
SSCCOMP_PREFIX_LIST_TAGSprocletics_tagsprefixesCONJ_SUFFIX_LIST_TAGSsuffixes_tagsCOMP_SUFFIX_LIST_TAGSenclitics_tags	AJUSTMENTajustment_table)selfs    R/var/www/html/speakWrite/venv/lib/python3.11/site-packages/qalsadi/stop_affixer.py__init__zstopword_affixer.__init__   s>      #8 6!7
  #}    c                 t   t          | g          }|r4|t          j        t          j        z   t          j        t          j        z   fv r$| t          j        z   }|                    |           |                     t          j                  r,| dd         t          j        z   }|                    |           |}|S )u  
        Generate the Stop stem variants according to the affixes.
        For example مدرستي = >مدرست+ي = > مدرسة +ي.
        Return a list of possible cases.
        @param stem: the input stem.
        @type stem: unicode.
        @param suffix_nm: suffix (no mark).
        @type suffix_nm: unicode.
        @return: list of stem variants.
        @rtype: list of unicode.
        N)setarabyYEHNOONWAWaddendswithALEF_MAKSURA)stem	suffix_nmpossible_stop_listpossible_stopvalidated_lists        r   get_stem_variantsz"stopword_affixer.get_stem_variants/   s    " !
 

  	2II
"I
"*
 
 
 !59,M""=111==## 	2 "I(::M""=111+r   c                    t          j        |          }|}|sC|dd         t           j        t           j        t           j        fv rt          j        |          rd}d| j        |         d         v rt          j        |          }n|}||fS )u$  
        Get the suffix variant to be joined to the word.
        For example: word = مدرس, suffix = ة, encletic = ي.
        The suffix is converted to Teh.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        r   N 
   متحركtags)r   strip_tashkeelr    r   ALEF	is_harakar   strip_lastharaka)r   wordsuffixencliticenclitic_nm	newsuffixsuffix_non_irab_marks          r   get_suffix_variantsz$stopword_affixer.get_suffix_variantsR   s     *844	 	RSS	e0%)UZHHH'' I I
 4-f5f===#(#9)#D#D  #, ...r   c                 t   |                     t          j                  r6|                     t          j                  r|dd         t          j        z   }|                     t          j        t          j        z             r6|                     t          j                  r|dd         t          j        z   }|S )u  
        Get the enclitic variant to be joined to the word.
        For example: word = عن, suffix = , encletic = ني.
        The word and enclitic are geminated.
        @param word: word found in dictionary.
        @type word: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: variant of suffixes  (vocalized suffix and vocalized
        suffix without I'rab short mark).
        @rtype: (unicode, unicode)
        r   N)
startswithr   r   r   SHADDAKASRAr   )r/   r1   s     r   get_enclitic_variantsz&stopword_affixer.get_enclitic_variantsu   s    $ uz** 	3t}}UZ/H/H 	3|el2Hu{UY677 	3DMM%)<T<T 	3|el2H r   c                    | }t          j        |          }|                    t           j                  rE|rC|dk    r|dd         t           j        z   }n|dd         t           j        z   t           j        z   }n|                    t           j                  ro|rm|                    t           j	                  r|dd         t           j
        z   }n6|                    t           j                  r|dd         t           j        z   }|r(|d         t           j        v rt          j        |          }|                    t           j                  rA|                     t           j        t           j        z             rt          j        |          }nl|                    t           j        t           j        z             r@|                     t           j        t           j        z             rt          j        |          }|S )uz  
        Get the word variant to be joined to the suffix.
        For example: word = مدرسة, suffix = ي. The word is converted to مدرست.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: suffix ( firts or second level).
        @type suffix: unicode.
        @return: variant of word.
        @rtype: unicode.
        u
   سِوَىNr   r   )r   r+   r   r    r,   r   SUKUNHAMZAr7   DAMMA	WAW_HAMZAr9   	YEH_HAMZAHARAKATr.   r   )r/   r0   	word_stemr"   s       r   get_word_variantz!stopword_affixer.get_word_variant   s    	(00	 e011 
	=i 
	=L((%crcNUZ7		%crcNUY6D		,, 	= 	=  -- =%crcNU_<		""5;// =%crcNU_<	  	:fQi5=00.y99I UZ(( 	:T]]5:;S-T-T 	:.y99IIu{UY677 	:DMMI#=
 =
 	: .y99Ir   c                    | j         |         d         d         }| j        |         d         d         }|}|                     |||z             }|                     |||          \  }}|                     ||          }d                    ||||g          }	d                    ||||g          }
|                     |	          }	|                     |
          }
|
|	fS )a  
        Join the  stop and its affixes, and get the vocalized form
        @param stop: stop found in dictionary.
        @type stop: unicode.
        @param proclitic: first level prefix.
        @type proclitic: unicode.

        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: vocalized word.
        @rtype: unicode.
        	vocalizedr   r(   )r   r   rC   r5   r:   joinajust_vocalization)r   stop	procliticr0   r1   proclitic_vocenclitic_voc
suffix_vocr4   word_non_irab_markword_vocalizeds              r   vocalizezstopword_affixer.vocalize   s     ,Y7DQG
 *84[A!D
 $$T6H+<==
 ,0+C+C*l,
 ,
(
( 11$EE
  WWD"6E
 
 -z<!PQQ "445GHH00@@111r   c                 $      fd|D             S )ad  
        Verify possible affixes in the resulted segments according
        to the given affixes list.
        @param word: the input word.
        @type word: unicode.
        @param list_seg: list of word segments indexes (numbers).
        @type list_seg: list of pairs.
        @return: list of acceped segments.
        @rtype: list of pairs.
        c                 ~    g | ]9}d                      d|d                  |d         d         g          v 7|:S )-Nr   r   )rF   ).0s
affix_listr/   s     r   
<listcomp>z1stopword_affixer.verify_affix.<locals>.<listcomp>   sS     
 
 
388T&AaD&\4!<,H#I#IZ#W#WA#W#W#Wr    )r/   list_segrU   s   ` `r   verify_affixzstopword_affixer.verify_affix   s4    
 
 
 
 

 
 
 	
r   c                    t          j        |          }d|v r
| d         sdS d|v r
| d         rdS d|v r| d         rd|vrdS d|v r
| d         sdS d	|v r
| d         sdS d
|v r
| d         sdS d
|v r
| d         rdS |t           j        k    r
| d         sdS d|v r/| d         s%| d                             t           j                  rdS d|v r
| d         sdS d|v r
| d         sdS d|v r
| d         sdS dS )a  
        Test if the given word from dictionary is compabilbe with affixes tags.
        @param stop_tuple: the input word attributes given from dictionary.
        @type stop_tuple: dict.
        @param affix_tags: a list of tags given by affixes.
        @type affix_tags:list.
        @param procletic: first level prefix vocalized.
        @type procletic: unicode.
        @param encletic_nm: first level suffix vocalized.
        @type encletic_nm: unicode.
        @return: if the tags are compaatible.
        @rtype: Boolean.
        u
   تعريف
definitionFdefinedu   جرis_inflectedu
   مجرورprepositionr)   u   مضافpronounu
   وقايةr/   u   استفهامinterrogu   قسمqasamu
   تنوينtanwinT)r   r+   r   r   )
stop_tuple
affix_tags	procleticencletic_nms       r   validate_tagszstopword_affixer.validate_tags  s    (33	 :%%j.F%5:%%*Y*?%5 j  >* ! J..5Z
=(A5:%%j.H%5##Jy,A#5##
9(=#5%)##J~,F#5 :%%~& &*4V*<*E*Eei*P*P & 5z))*Z2H)5 z!!*W*=!5
 :%%j.B%5 tr   c                 <    | j                             ||          }|S )z
        ajust vocalization
        Temporary function
        @param vocalized: vocalized word.
        @type vocalized: unicode.
        @return: ajusted vocalized word.
        @rtype: unicode.
        )r   get)r   rE   ajusteds      r   rG   z#stopword_affixer.ajust_vocalizationE  s!     &**9i@@r   N)__name__
__module____qualname____doc__r   staticmethodr&   r5   r:   rC   rO   rY   rg   rG   rW   r   r   r   r      s         - - -"     \ D!/ !/ !/F   \4 ( ( \(T52 52 52n 
 
 \
 @ @ \@D    r   r   )rn   pyarabic.arabyr   tashaphyne.stemming
tashaphynetashaphyne.normalize!arramooz.stopwordsdictionaryclassstopwordsdictionaryclassr(   r   r	   r   rW   r   r   <module>rv      s                   D D D D D D ) ) ) ) ) )w w w w w w w w w wr   