
    -iw                     $   d Z edk    rddlZej                            d           ddlZddlmZ ddlm	Z	 ddl
ZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ  G d d          ZdS )zm
Arabic text morphological analyzer.
Provides routins  to alanyze text.
Can treat text as verbs or as nouns.
__main__    Nz..   )analex_const)	stem_noun)	stem_verb)stem_unknown)	stem_stop)stem_pounct_const)disambig)wordcase)stemmedwordcache)cache_pickledbc                       e Zd ZdZ	 d&dZd Zd Zd Zd Zd'd
Z	d Z
d Zd Zd Zd Zd Zd Zd(dZd Zd Zd)dZd Zd Zd'dZd Zd Zd Zd Zd Zd  Zd! Zd" Ze d*d#            Z!d$ Z"e d%             Z#dS )+Analexz}
    Arabic text morphological analyzer.
    Provides routins  to alanyze text.
    Can treat text as verbs or as nouns.
    FTc                 Z   t          j                    | _        t          j                    | _        t          j                    | _        t          j
                    | _        || _        |o|| _        d| _        | j        r#t          j                                        | _        | j        rt'          j                    | _        d| _        d| _        d| _        d                    t4          j                  }t9          j        dd                    t4          j                  dt8          j                  | _        d| _         tC          j"        dtB          j#                  | _$        d| _%        || _&        |rtO          j(        |          | _'        nd	| _'        d| _)        d| _*        i | _+        d	S )
z)
        Create Analex instance.
        TFi'  r    z([\wz\s]+)wordfreqN),r   NounStemmernounstemmerr   VerbStemmerverbstemmerr   UnknownStemmerunknownstemmerstem_stopwordsStopWordStemmerstopwordsstemmerallow_tag_guessingallow_disambiguationallow_syntax_lastmark	naftawayhwordtag
WordTaggertaggerr   DisambiguatordisambiguatordebuglimitwordcounterjoinarabyTASHKEELrecompileUNICODEclause_patternpartial_vocalization_supportwordfreqdictionaryclassWordFreqDictionaryWORDFREQ_DICTIONARY_INDEXr   allow_cache_use
cache_pathr   Cachefully_vocalized_input
error_codewordfreq_cache)selfr7   r   r    markss        L/var/www/html/speakWrite/venv/lib/python3.11/site-packages/qalsadi/analex.py__init__zAnalex.__init__I   s    %022$022*9;; . > @ @ #5 %9$O=O! &*"" 	9#+6688DK$ 	:!)!7!9!9D

 '' !jj ggen55557
 
 -1) 0B/I
 
  %$ 	Z00DJJDJ &+"     c                 h    d| _         d| _        d| _        d| _        d| _        d| _        d| _        dS )z1
        Delete instance and clear cache
        N)r   r   r   r   r   r%   r'   r<   s    r>   __del__zAnalex.__del__   s@     " $!r@   c                 0    | xj         dz  c_         | j         S )zqcount input words. Used just for profiling and tests.
        @return: counter.
        @rtype: integer.
        r   )r*   rB   s    r>   
count_wordzAnalex.count_word   s!     	Ar@   c                 |    || j         v r| j         |         S | j                            ||          }|| j         |<   |S )z'
        Return word_frequency
        )r;   r   get_freq)r<   wordwordtypefreqs       r>   rG   zAnalex.get_freq   sI     4&&&&t,,=))$99D(,D%Kr@   c                 n    d| j                                         d| j                                        S )z?
        Return error code when word is not recognized
        Nz-V)r   get_error_coder   rB   s    r>   rM   zAnalex.get_error_code   s=      ++----++---
 	
r@   r   c                 *    t          j        |          S )z
        Tokenize text into words
        @param text: the input text.
        @type text: unicode.
        @return: list of words.
        @rtype: list.
        )r,   tokenizer<   texts     r>   rO   zAnalex.tokenize   s     ~d###r@   c                 "   |r| j                             |          }|rnd}g }|D ]e}| j                             |          s/|dk     r|                    |           d}:||xx         |z  cc<   K|                    |           |dz  }f|S g S g S )z
        Split Text into clauses
        @param text: input text
        @type text: unicode
        @return: list of clauses
        @rtype: list of unicode
        r   r   )r1   splitmatchappend)r<   rQ   list_phrasejnewlistphrs         r>   split_into_phraseszAnalex.split_into_phrases   s      	-33D99K &  C.44S99  q55#NN3/// !AA $AJJJ#-JJJJs+++Q		r@   c                 H    |                      |          }d |D             }|S )z
        Tokenize text into words, after treatement.
        @param text: the input text.
        @type text: unicode.
        @return: list of words.
        @rtype: list.
        c                     g | ]}||S  r^   ).0rH   s     r>   
<listcomp>z(Analex.text_tokenize.<locals>.<listcomp>   s    888d48T888r@   )rO   )r<   rQ   	list_words      r>   text_tokenizezAnalex.text_tokenize   s.     MM$''	88i888	r@   c                 |    || _         | j                            |           | j                            |           dS )z
        Set the debug attribute to allow printing internal analysis results.
        @param debug: the debug value.
        @type debug: True/False.
        N)r(   r   	set_debugr   )r<   r(   s     r>   rd   zAnalex.set_debug   s?     
""5)))""5)))))r@   c                 x    d| _         | j                                         | j                                         dS )zX
        Enable the syntaxic last mark attribute to allow use of I'rab harakat.
        TN)r!   r   enable_syntax_lastmarkr   rB   s    r>   rf   zAnalex.enable_syntax_lastmark   s<     &*"//111//11111r@   c                     d| _         dS )zS
        Enable fully vocalized  input in case of vocalized text analysis.
        TN)r9   rB   s    r>   enable_fully_vocalized_inputz#Analex.enable_fully_vocalized_input	  s     &*"""r@   c                 x    d| _         | j                                         | j                                         dS )zY
        Disable the syntaxic last mark attribute to allow use of I'rab harakat.
        FN)r!   r   disable_syntax_lastmarkr   rB   s    r>   rj   zAnalex.disable_syntax_lastmark  s<     &+"002220022222r@   c                     || _         dS )z
        Set the number of word treated in text.
        @param limit: the word number limit.
        @type limit: integer.
        N)r)   )r<   r)   s     r>   	set_limitzAnalex.set_limit  s     


r@   Nc                     || _         dS )z:
        Use a cache system to qalsadi to be used
        Nr   )r<   cachers     r>   
set_cacherzAnalex.set_cacher!  s     


r@   c                 n    d| _         | j        s&t                              | j                  | _        dS dS )zA
        Allow the analex to use Cache to reduce calcul.
        TN)r6   r   cachemanagerr8   r7   rB   s    r>   enable_allow_cache_usezAnalex.enable_allow_cache_use'  s;      $z 	=%++DO<<DJJJ	= 	=r@   c                     d| _         dS )zE
        Not allow the analex to use Cache to reduce calcul.
        FN)r6   rB   s    r>   disable_allow_cache_usezAnalex.disable_allow_cache_use/  s      %r@   allc           	         |                      |          }dgt          |          z  }| j        rM| j                            |          }t          |          t          |          k    rdgt          |          z  }| j        r=| j                            ||          }t          |          t          |          k    r|}g }|dk    rt          t          t          |d| j
                                               D ]]}||         }|                                  ||         }	|                     ||	          }
d |
D             }|                    |           ^n|dk    rI|d| j
                 D ]8}|                     |          }
d |
D             }|                    |           9nN|dk    rH|d| j
                 D ]8}|                     |          }
d	 |
D             }|                    |           9|S )
a0  
        Analyze text morphologically.
        @param text: the input text.
        @type text: unicode.
        @param mode: the mode of analysis as 'verbs', 'nouns', or 'all'.
        @type mode: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   nvru   Nc                 6    g | ]}t          j        |          S r^   r   StemmedWordr_   ws     r>   r`   z%Analex.check_text.<locals>.<listcomp>b  0     ) ) )34K+A..) ) )r@   nounsc                 6    g | ]}t          j        |          S r^   ry   r{   s     r>   r`   z%Analex.check_text.<locals>.<listcomp>j  r}   r@   verbsc                 6    g | ]}t          j        |          S r^   ry   r{   s     r>   r`   z%Analex.check_text.<locals>.<listcomp>r  r}   r@   )rb   lenr   r%   word_taggingr    r'   disambiguate_wordslistranger)   rE   
check_wordrV   check_word_as_nouncheck_word_as_verb)r<   rQ   modera   list_guessed_tagnewwordlistresulted_datairH   
guessedtagone_data_liststemmed_one_data_lists               r>   
check_textzAnalex.check_text5  sh    &&t,,	4#i..0" 	;#{77	BB#$$I66
 %)6C	NN#: $ 	(,??+ K ;3y>>11'	
  5==%Il
l$; < <==>> < < |!!!-a0
 $j A A) )8E) ) )% $$%:;;;;< W__!,DJ,/ < < $ 7 7 = =) )8E) ) )% $$%:;;;;< W__!,DJ,/ < < $ 7 7 = =) )8E) ) )% $$%:;;;;r@   c                 l    t          j        |          rdS |D ]}|dv r dS |t          j        v r dS dS z
        tag words as verbs or nouns according to some features
        Some letters are forbiden in some types like TehMarbuta in verbs
        stopu   إةnonverbr   	stopwordsis_stopr,   TANWINr<   rH   cs      r>   	light_tagzAnalex.light_tagy  Y    
 T"" 	6 	! 	!AF{{ yyEL   yy !rr@   c                 l    t          j        |          rdS |D ]}|dv r dS |t          j        v r dS dS r   r   r   s      r>   
light_tag2zAnalex.light_tag2  r   r@   c                     t          j        |          }|}t          j        |          }t          j        |          }| j        r6| j                            |          r| j                            |          }ng }||                     |          z  }t          j	        |          r3| 
                    |          dk    r||                     |          z  }| j        r| j                            |          s| j                            |          r||                     |          z  }| j                            |          s| j                            |          r||                     |          z  }nb| 
                    |          dk    r||                     |          z  }| 
                    |          dk    r||                     |          z  }t'          |          dk    r||                     |          z  }|                     ||          }|                     ||| j                  }|                     |          }d |D             }| j        r| j                            ||           | j        r|                     ||          }t'          |          dk    r`|                                 }|                    t=          j        |d|||||d|z  dd	d	|                      |d          d	d
                     |S )z
        Analyze one word morphologically as verbs
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   r   nonnounr   c                     g | ]	}|j         
S r^   )__dict__r{   s     r>   r`   z%Analex.check_word.<locals>.<listcomp>  s    %H%H%HQaj%H%H%Hr@   r   r   r   r   z%sunknownr   )rH   affixstemoriginallemma	vocalizedsemivocalizedtagstyperoottemplaterJ   syntax)!r,   strip_tatweelstrip_tashkeelstrip_harakatr6   r   is_already_checkedget_checkedcheck_word_as_pounctis_arabicwordr   check_word_as_stopwordr   r%   has_verb_tagis_stopword_tagr   has_noun_tagr   r   check_word_as_unknowncheck_normalizedcheck_shaddar9   add_word_frequencyadd_checkedr2   check_partial_vocalizedrM   rV   r   WordCaserG   )	r<   rH   r   word_vocalisedword_nmword_nm_shaddar   data_list_to_serializer:   s	            r>   r   zAnalex.check_word  sX    "4((&t,,,T22 :	HDJ$A$A'$J$J :	H J227;;MMMT66w???M
 "7++ J>>$''611!T%@%@%I%IIM* J {//"  J44Z@@J &)@)@)I)II{//"  J44Z@@J &)@)@)I)II ~~d++y88%)@)@)I)II~~d++y88%)@)@)I)II=!!Q&& !;!;G!D!DD !11'=IIM !--t/I M
 !33MBBM &I%H-%H%H%H"# H
&&w0FGGG , 	X 88WWM}"",,..J  ! $!1 $$(!%%))- $z 1 ) "$& $dI > >"$    & r@   c                 .    |                      |d          S )z
        Analyze text morphologically as nouns
        @param text: the input text.
        @type text: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r~   r   rP   s     r>   check_text_as_nounszAnalex.check_text_as_nouns       tW---r@   c                 .    |                      |d          S )z
        Analyze text morphologically as verbs
        @param text: the input text.
        @type text: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   r   rP   s     r>   check_text_as_verbszAnalex.check_text_as_verbs  r   r@   c                    t          |          D ]\  }}|j        }|j        }|dk    rd}n|dk    rd}n|dk    rd}nd}|r| j        r<| j                            ||          r!| j                            ||          |_        n@|                     ||          }| j        r| j                            |||           ||_        |||<   |S )uO  
        If the entred word is like the found word in dictionary,
        to treat some normalized cases,
        the analyzer return the vocalized like words
        ُIf the word is ذئب, the normalized form is ذءب, which can give
        from dictionary ذئبـ ذؤب.
        this function filter normalized resulted word according the
        given word, and give ذئب.
        @param resulted_data: the founded resulat from dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        freqverbverbfreqnounnounfreqstopwordstopwordr   )	enumerater   rJ   r6   r   exists_cache_freqrG   add_freq)r<   r   r   itemr   freqtyperI   rJ   s           r>   r   zAnalex.add_word_frequency  s   & !// (	$ (	$GAt }H
 yH:%%!Z''!^++% % ' %DJ,H,Hh- - % !%
 3 3Hh G GDII  ==8<<D + F
++HhEEE !%DI#M!r@   c                 6    | j                             |          S )z
        Check if the word is a stopword,
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        )r   stemming_stopword)r<   rH   s     r>   r   zAnalex.check_word_as_stopwordL  s     $66t<<<r@   c                 h   g }|s|S |                                 r3|                    t          j        |dd|||dddddd                     |D ]}|t          j        vr nP|                    t          j        |dd|||t          j        |d                  d         ddddd                     |S )	z
        Check if the word is a pounctuation,
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   r   u   عددNUMBERr   )rH   r   r   r   r   r   r   r   rJ   r   r   r   POUNCT)	isnumericrV   r   r   r
   POUNCTUATION)r<   rH   detailed_resultchars       r>   r   zAnalex.check_word_as_pounctV  s     	#"">> 	""! $!1 "$( $%) ( ( !"$ "    $  	 	D,999 : ""! $!1 "$(!%%) 1 >tAw G O ( !"$ "    $ r@   c                 6    | j                             |          S )z
        Analyze the word as verb.
        @param verb: the input word.
        @type verb: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        )r   stemming_verb)r<   r   s     r>   r   zAnalex.check_word_as_verb       --d333r@   c                 6    | j                             |          S )z
        Analyze the word as noun.
        @param noun: the input word.
        @type noun: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        )r   stemming_nounr<   r   s     r>   r   zAnalex.check_word_as_noun  r   r@   c                 6    | j                             |          S )z
        Analyze the word as unknown.
        @param noun: the input word.
        @type noun: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        )r   r   r   s     r>   r   zAnalex.check_word_as_unknown  s     "00666r@   c                 @     |r fd|D             S  fd|D             S )a  
        if the entred word is like the found word in dictionary,
        to treat some normalized cases,
        the analyzer return the vocalized like words.
        This function treat the Shadda case.
        @param word_nm_shadda: a word without harakat, but shadda
        @type word_nm_shadda: unicode
        @param resulted_data: the founded resulat from dictionary.
        @type resulted_data: list of dict.
        @param fully_vocalized_input: if the two words must resect the shadda and vocalized.
        @type fully_vocalized_input: Boolean, default is False.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        c                 r    g | ]3}t          j                  t          j        |j                  k    1|4S r^   )r,   r   r   r_   xr   s     r>   r`   z'Analex.check_shadda.<locals>.<listcomp>  sQ       &~66&q{334 4 4 4 4r@   c                 H    g | ]}t          j        |j                  |S r^   )r,   
shaddaliker   r   s     r>   r`   z'Analex.check_shadda.<locals>.<listcomp>  s>       #NAK@@  r@   r^   )r   r   r9   s   `  r>   r   zAnalex.check_shadda  sc      ! 	   &      &   r@   c                      fd|D             S )u  
        If the entred word is like the found word in dictionary,
        to treat some normalized cases,
        the analyzer return the vocalized like words
        ُIf the word is ذئب, the normalized form is ذءب,
        which can give from dictionary ذئبـ ذؤب.
        this function filter normalized resulted word according
        the given word, and give ذئب.
        @param word_nm the input word.
        @type word_nm: unicode.
        @param word_unvocalised: the input word unvocalized.
        @type word_unvocalised: unicode.
        @param resulted_data: the founded resulat from dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        c                 *    g | ]}|j         k    |S r^   )unvocalized)r_   dr   s     r>   r`   z+Analex.check_normalized.<locals>.<listcomp>  s%    EEEaAMW,D,D,D,D,Dr@   r^   )r<   r   r   s    ` r>   r   zAnalex.check_normalized  s     $ FEEE=EEEEr@   c                 *   g }t          j        |           s|S |D ]}d|v r|d         }d|d         v }t          j        | |          r3|dxx         dt          j        z   z  cc<   |                    |           `|r|                     t           j                  rv|                    t           j                  rWt          j        | dd         |dd                   r2|dxx         dt          j        z   z  cc<   |                    |           |S )a  
        if the entred word is vocalized fully or partially,
        the analyzer return the vocalized like words
        This function treat the partial vocalized case.
        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the founded resulat from dictionary.
        @type resulted_data: list of dict.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        r   Verbr   r   :NrS   )	r,   is_vocalizedvocalizedliker   PARTIAL_VOCALIZED_TAGrV   endswithKASRASUKUN)r   r   filtred_datar   outputis_verbs         r>   r   zAnalex.check_partial_vocalized  s;    !.11 	6   & 6 6$&&!+.F$V4G*>6BB 6Vl.P(PP$++D1111  6*33EK@@6 #OOEK886
 !.~crc/BF3B3KPP 6 LLLC,2T,TTLLL(//555r@   )FFT)r   )N)ru   )F)$__name__
__module____qualname____doc__r?   rC   rE   rG   rM   rO   r[   rb   rd   rf   rh   rj   rl   ro   rr   rt   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodr   r   r   r^   r@   r>   r   r   B   sB         PTE! E! E! E!N
" 
" 
"     	 	 		
 	
 	
$ $ $ $  @  * * *2 2 2* * *3 3 3     = = =% % %B B B BH    c c c cJ. . .. . .< < <|= = =7 7 7r4 4 44 4 47 7 7    \BF F F8 # # \# # #r@   r   )r  r   syspathrV   r.   pyarabic.arabyr,    arramooz.wordfreqdictionaryclassr3   naftawayh.wordtagr"   arabicstopwords.arabicstopwordsarabicstopwordsr   r   r   r   r   r   r	   r   r
   r   r   r   rq   r   qalsadi.cachemanagerr   r   r^   r@   r>   <module>r     s    zJJJHOOD
 
			        C B B B B B     3 3 3 3 3 3                         ) ) ) ) ) )                               8 8 8 8 8 8M M M M M M M M M Mr@   