
    -iK                        d Z ddlmZmZmZmZ ddlZddlZej        	                    d           ddl
mZ edk    r3ej        	                    d           ddlZddlZddlZddlZddlZddlZn$ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ  G d d          Zedk    rA e            Zg dZeD ]1Ze                    e            ede                                            eej                    ede                                            ede                                            edej                    ee                                            ee                    d                      ede                                             edej!                    ede"                                            ee"                    d                      ee#                                            ede$                                            ede%                                            ede&                                            ede'                                            ee(                    e                      ee)                                            e e*e+                                                     1dS dS )a\  
Arabic Light Stemmer
A class which provides a configurable stemmer
and segmentor for arabic text.

Features:
=========

    - Arabic word Light Stemming.
    - Root Extraction.
    - Word Segmentation
    - Word normalization
    - Default Arabic Affixes list.
    - An customizable Light stemmer: possibility of change
    stemmer options and data.
    - Data independent stemmer


@author: Taha Zerrouki <taha_zerrouki at gmail dot com>
@author: Taha Zerrouki
@contact: taha dot zerrouki at gmail dot com
@copyright: Arabtechies,  Arabeyes,   Taha Zerrouki
@license: GPL
@date:2017/02/15
@version:0.3
    )absolute_importprint_functionunicode_literalsdivisionNz../support/__main__z../   )	normalize)
stem_const)affix_const)roots_const)verb_stamp_const)arabicstopwordsc                      e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdCdZd Zd Z d  Z!d! Z"d" Z#d# Z$dCd$Z%d% Z&dCd&Z'dDd'Z(dDd(Z)dCd)Z*dEd+Z+d, Z,d- Z-dCd.Z.d/ Z/d0 Z0d1 Z1d2 Z2d3 Z3d4 Z4d5 Z5g fd6Z6dFd9Z7dCd:Z8dGd;Z9dGd<Z:e;d=             Z<e;d>             Z=e;d?             Z>d@ Z?e;dA             Z@dBS )HArabicLightStemmera  
    ArabicLightStemmer: a class which proved a configurable stemmer
    and segmentor for arabic text.

    Features:
    =========

        - Arabic word Light Stemming.
        - Root Extraction.
        - Word Segmentation
        - Word normalization
        - Default Arabic Affixes list.
        - An customizable Light stemmer: possibility of change
        stemmer options and data.
        - Data independent stemmer


    @author: Taha Zerrouki <taha_zerrouki at gmail dot com>
    @author: Taha Zerrouki
    @contact: taha dot zerrouki at gmail dot com
    @copyright: Arabtechies,  Arabeyes,   Taha Zerrouki
    @license: GPL
    @date:2017/02/15
    @version:0.3
    c                    t           j        | _        t           j        | _        t           j        | _        t           j        | _        t           j	        | _
        t           j        | _        t           j        | _        t           j        | _        t           j        | _        t&          j        | _        t-          t/          t0          j                  t/          t0          j                  z             | _        d| _        d| _        d| _        d| _        d| _         d| _!        d| _"        g | _#        tI          j%        dtH          j&                  | _'        | (                    | j                  | _)        | *                    | j                  | _+        d S )N r   u   [^\wً-ْ']+),r
   DEFAULT_PREFIX_LETTERSprefix_lettersDEFAULT_SUFFIX_LETTERSsuffix_lettersDEFAULT_INFIX_LETTERSinfix_lettersDEFAULT_MAX_PREFIXmax_prefix_lengthDEFAULT_MAX_SUFFIXmax_suffix_lengthDEFAULT_MIN_STEMmin_stem_lengthDEFAULT_JOKERjokerDEFAULT_PREFIX_LISTprefix_listDEFAULT_SUFFIX_LISTsuffix_listr   ROOTS	root_listsetlistr   VERB_AFFIX_LISTNOUN_AFFIX_LISTvalid_affixes_listwordunvocalized
normalizedstarwordrootleftrightsegment_listrecompileUNICODE	token_pat_create_prefix_treeprefixes_tree_create_suffix_treesuffixes_treeselfs    Q/var/www/html/speakWrite/venv/lib/python3.11/site-packages/tashaphyne/stemming.py__init__zArabicLightStemmer.__init__O   s'    )?(?'=!+!>!+!>):-
%9%9$* #&d;+F&G&G${OjJkJk&k"l"l			
 $;RZHH!55d6FGG!55d6FGG    c                     | j         S )z return the prefixation letters.
        This constant take DEFAULT_PREFIX_LETTERS by default.
        @return: return a letters.
        @rtype: unicode.
        r   r<   s    r>   get_prefix_lettersz%ArabicLightStemmer.get_prefix_lettersp        ""r@   c                     || _         dS )u    set the prefixation letters.
        This constant take DEFAULT_PREFIX_LETTERS by default.
        @param new_prefix_letters: letters to be striped from a word,
        e.g.new_prefix_letters = u"وف":.
        @type new_prefix_letters: unicode.
        NrB   )r=   new_prefix_letterss     r>   set_prefix_lettersz%ArabicLightStemmer.set_prefix_lettersx        1r@   c                     | j         S )z return the suffixation letters.
        This constant take DEFAULT_SUFFIX_LETTERS by default.
        @return: return a letters.
        @rtype: unicode.
        r   r<   s    r>   get_suffix_lettersz%ArabicLightStemmer.get_suffix_letters   rD   r@   c                     || _         dS )u   set the suffixation letters.
        This constant take DEFAULT_SUFFIX_LETTERS by default.
        @param new_suffix_letters: letters to be striped from the end of a word,
        e.g.new_suffix_letters = u"ةون":.
        @type new_suffix_letters: unicode.
        NrJ   )r=   new_suffix_letterss     r>   set_suffix_lettersz%ArabicLightStemmer.set_suffix_letters   rH   r@   c                     | j         S )z get the inffixation letters.
        This constant take DEFAULT_INFIX_LETTERS by default.
        @return: infixes letters.
        @rtype: unicode.
        r   r<   s    r>   get_infix_lettersz$ArabicLightStemmer.get_infix_letters   s     !!r@   c                     || _         dS )u   set the inffixation letters.
        This constant take DEFAULT_INFIX_LETTERS by default.
        @param new_infix_letters: letters to be striped from the middle
        of a word, e.g.new_infix_letters = u"أوي":.
        @type new_infix_letters: unicode.
        NrP   )r=   new_infix_letterss     r>   set_infix_lettersz$ArabicLightStemmer.set_infix_letters   s     /r@   c                     | j         S )z get the joker letter.
        This constant take DEFAULT_JOKER by default.
        @return: joker letter.
        @rtype: unicode.
        )r    r<   s    r>   	get_jokerzArabicLightStemmer.get_joker   s     zr@   c                 J    t          |          dk    r|d         }|| _        dS )z set the joker letter.
        This constant take DEFAULT_JOKER by default.
        @param new_joker: joker letter.
        @type new_joker: unicode.
        r   r   N)lenr    )r=   	new_jokers     r>   	set_jokerzArabicLightStemmer.set_joker   s)     y>>A!!I


r@   c                     | j         S )z return the constant of max length of the prefix used by the stemmer.
        This constant take DEFAULT_MAX_PREFIX_LENGTH by default.
        @return: return a number.
        @rtype: integer.
        r   r<   s    r>   get_max_prefix_lengthz(ArabicLightStemmer.get_max_prefix_length        %%r@   c                     || _         dS )a   Set the constant of max length of the prefix used by the stemmer.
        This constant take DEFAULT_MAX_PREFIX_LENGTH by default.
        @param new_max_prefix_length: the new max prefix length constant.
        @type new_max_prefix_length: integer.
        Nr\   )r=   new_max_prefix_lengths     r>   set_max_prefix_lengthz(ArabicLightStemmer.set_max_prefix_length        "7r@   c                     | j         S )z return the constant of max length of the suffix used by the stemmer.
        This constant take DEFAULT_MAX_SUFFIX_LENGTH by default.
        @return: return a number.
        @rtype: integer.
        r   r<   s    r>   get_max_suffix_lengthz(ArabicLightStemmer.get_max_suffix_length   r^   r@   c                     || _         dS )a   Set the constant of max length of the suffix used by the stemmer.
        This constant take DEFAULT_MAX_SUFFIX_LENGTH by default.
        @param new_max_suffix_length: the new max suffix length constant.
        @type new_max_suffix_length: integer.
        Nrd   )r=   new_max_suffix_lengths     r>   set_max_suffix_lengthz(ArabicLightStemmer.set_max_suffix_length   rb   r@   c                     | j         S )z return the constant of min length of the stem used by the stemmer.
        This constant take DEFAULT_MIN_STEM_LENGTH by default.
        @return: return a number.
        @rtype: integer.
        r   r<   s    r>   get_min_stem_lengthz&ArabicLightStemmer.get_min_stem_length   s     ##r@   c                     || _         dS )z Set the constant of min length of the stem used by the stemmer.
        This constant take DEFAULT_MIN_STEM_LENGTH by default.
        @param new_min_stem_length: the min stem length constant.
        @type new_min_stem_length: integer.
        Nrj   )r=   new_min_stem_lengths     r>   set_min_stem_lengthz&ArabicLightStemmer.set_min_stem_length   s      3r@   c                     | j         S )z return the prefixes list used by the stemmer.
        This constant take DEFAULT_PREFIX_LIST by default.
        @return: prefixes list.
        @rtype: set().
        )r"   r<   s    r>   get_prefix_listz"ArabicLightStemmer.get_prefix_list        r@   c                 H    || _         |                     | j                    dS )z Set  prefixes list used by the stemmer.
        This constant take DEFAULT_PREFIX_LIST by default.
        @param new_prefix_list: a set of prefixes.
        @type new_prefix_list: set of unicode string.
        N)r"   r8   )r=   new_prefix_lists     r>   set_prefix_listz"ArabicLightStemmer.set_prefix_list   )     +  !122222r@   c                     | j         S )z return the suffixes list used by the stemmer.
        This constant take DEFAULT_SUFFIX_LIST by default.
        @return: suffixes list.
        @rtype: set().
        )r$   r<   s    r>   get_suffix_listz"ArabicLightStemmer.get_suffix_list   rq   r@   c                 H    || _         |                     | j                    dS )z Set  suffixes list used by the stemmer.
        This constant take DEFAULT_SUFFIX_LIST by default.
        @param new_suffix_list: a set of suffixes.
        @type new_suffix_list: set of unicode string.
        N)r$   r:   )r=   new_suffix_lists     r>   set_suffix_listz"ArabicLightStemmer.set_suffix_list   ru   r@   c                     | j         S )z return the roots list used by the stemmer to validate roots.
        This constant take roots_const.ROOTS by default.
        @return: roots list.
        @rtype: set().
        
roots_listr<   s    r>   get_roots_listz!ArabicLightStemmer.get_roots_list  s     r@   c                     || _         dS )z Set  roots list used by the stemmer to validate roots..
        This constant take roots_const.ROOTS by default.
        @param new_roots_list: a set of roots.
        @type new_roots_list: set of unicode string.
        Nr|   )r=   new_roots_lists     r>   set_roots_listz!ArabicLightStemmer.set_roots_list  s     )r@   c                     | j         S )z return the valid_affixes list used by the stemmer to validate affixes.
        This constant take valid_affixes_const.ROOTS by default.
        @return: valid_affixes list.
        @rtype: set().
        r+   r<   s    r>   get_valid_affixes_listz)ArabicLightStemmer.get_valid_affixes_list  s     &&r@   c                     || _         dS )a   Set  valid_affixes list used by the stemmer to validate affixes..
        This constant take valid_affixes_const.ROOTS by default.
        @param new_valid_affixes_list: a set of valid_affixes.
        @type new_valid_affixes_list: set of unicode string.
        Nr   )r=   new_valid_affixes_lists     r>   set_valid_affixes_listz)ArabicLightStemmer.set_valid_affixes_list  s     #9r@   c                     || _         dS )zw Set the word to treat by the stemmer.
        @param new_word: the new word.
        @type new_word: unicode.
        Nr,   )r=   new_words     r>   set_wordzArabicLightStemmer.set_word#  s    
 			r@   c                     | j         S )zf return the last word treated by the stemmer.
        @return: word.
        @rtype: unicode.
        r   r<   s    r>   get_wordzArabicLightStemmer.get_word*  s    
 yr@   c                     | j         S )u   return the starlike word treated by the stemmer.
        All non affix letters are converted to a joker.
        The joker take by default DEFAULT_JOKER = "*".

        Exmaple:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتصربونني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_starword()
            أفت***ونني

        @return: word.
        @rtype: unicode.
        )r/   r<   s    r>   get_starwordzArabicLightStemmer.get_starword4  s     }r@   c                     |dk    s|dk    r|                      ||           n|                                 | _        | j        S )u   return the root of the treated word by the stemmer.
        All non affix letters are converted to a joker.
        All letters in the joker places are part of root.
        The joker take by default DEFAULT_JOKER = "*".

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتصربونني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_root()
            ضرب

        @param prefix_index: indicate the left stemming position
        if = -1: not cosidered, and take the default word prefix lentgh.
        @type prefix_index:integer.
        @param suffix_index:indicate the right stemming position.
        if = -1: not cosidered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: root.
        @rtype: unicode.
        r   )extract_root_choose_rootr0   r=   prefix_indexsuffix_indexs      r>   get_rootzArabicLightStemmer.get_rootE  sM    2 1 1 1lL9999))++DIyr@   c                    t          j        | j                  rt          j        | j                  S | j        s|                     | j                   |                                 }d |D             }|}t          t          | j	        |                    }|r|}t          t          | j
        |                    }|r|}|                     |          }|S )z" choose a root for the given word c                     g | ]
}|d          S r0    ).0ds     r>   
<listcomp>z3ArabicLightStemmer._choose_root.<locals>.<listcomp>l  s    ///q6///r@   )r   is_stopr,   	stop_rootr3   segmentget_affix_listr(   filteris_root_length_validis_rootmost_common)r=   
affix_listroots	roots_tmpacceptedaccepted_roots         r>   r   zArabicLightStemmer._choose_rootd  s    "49-- 	8",TY777  	$LL###((**
//J///	t8)DDEE 	 9t|Y77@@ 	 9((33r@   c                 T    t          j         j                  rt          j         j                  S  j        s                      j                    j        } fd|D             }|sd}t           j                  }n                     |          \  }} j        ||         S )z" choose a stem for the given word c                 J    g | ]\  }}                     ||          ||f S r   )_verify_affix)r   xyr=   s      r>   r   z3ArabicLightStemmer._choose_stem.<locals>.<listcomp>  s6    KKKeq43E3Ea3J3JKQqEKKKr@   r   )	r   r   r,   	stop_stemr3   r   rX   get_left_rightr-   )r=   seg_listr1   r2   s   `   r>   _choose_stemzArabicLightStemmer._choose_stem{  s     "49-- 	8",TY777  	$LL###$KKKKxKKK  	8D	NNEE--h77KD%U
++r@   c                     | j         S )u   return the normalized form of the treated word by the stemmer.
        Some letters are converted into normal form like Hamzat.

        Example:
            >>> word = u"استؤجرُ"
            >>> ArListem = ArabicLightStemmer()
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_normalized()
            استءجر

        @return: normalized word.
        @rtype: unicode.
        )r.   r<   s    r>   get_normalizedz!ArabicLightStemmer.get_normalized  s     r@   c                     | j         S )u   return the unvocalized form of the treated word by the stemmer.
        Harakat are striped.

        Example:
            >>> word = u"الْعَرَبِيّةُ"
            >>> ArListem = ArabicLightStemmer()
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_unvocalized()
            العربية

        @return: unvocalized word.
        @rtype: unicode.
        )r-   r<   s    r>   get_unvocalizedz"ArabicLightStemmer.get_unvocalized  s     r@   c                     | j         S )u   return the the left position of stemming
        (prefixe end position )in the word treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتصربونني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_left()
            3

        @return: the left position of stemming.
        @rtype: integer.
        )r1   r<   s    r>   get_leftzArabicLightStemmer.get_left  s      yr@   c                     | j         S )u   return the the right position of stemming
        (suffixe start position )in the word treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتصربونني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_right()
            6

        @return: the right position of stemming.
        @rtype: integer.
        )r2   r<   s    r>   	get_rightzArabicLightStemmer.get_right  s    " zr@   c                     |dk    s|dk    r/|dk     r| j         }n|}|dk     r| j        }n|}| j        ||         S |                                 }|S )u   return the stem of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتكاتبانني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_stem()
            كاتب

        @param prefix_index: indicate the left stemming position
        if = -1: not cosidered, and take the default word prefix lentgh.
        @type prefix_index:integer.
        @param suffix_index:indicate the right stemming position.
        if = -1: not cosidered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: stem.
        @rtype: unicode.
        r   )	stem_left
stem_rightr-   r   )r=   r   r   r1   r2   stems         r>   get_stemzArabicLightStemmer.get_stem  sw    , 1 1 1a~ $a %#DJ//$$&&Dr@   c                 f   |}|                     t          j        d          }t          |          dk    rCt	          j        dt          j        t          j        t          j        d| j	        |          }|S |dd         t	          j        t          j        | j	        |dd                   z   }| j
        ||                             d          r9|dd         t	          j        t          j        | j	        |dd                   z   }n%t	          j        t          j        | j	        |          }| j
        ||                             d          r9|dd         t	          j        t          j        | j	        |dd                   z   }n%t	          j        t          j        | j	        |          }|S )	zh
        Handle case of Teh as infix.
        The Teh can be Dal after Zain, and Tah after Dhad
        r      []N   u   ضطu   زد)replacearabyTEH_MARBUTArX   r4   subTEHTAHDALr    r,   
startswith)r=   r/   r1   r2   newstarstemkey_stems         r>   _handle_teh_infixz$ArabicLightStemmer._handle_teh_infix  sr   
 &&u'8<<x==A&&eiiEIII!NPTPZ\ghhK ""1"obfUY
KPQPRPRO&T&TT9T%Z ++G44 	E%bqb/"&DJTUTVTV*X*XXKK&DJDDKId5j!,,W55 	E%bqb/"&DJTUTVTV*X*XXKK&DJDDKr@   c                    | j         }|dk     r|dk     r|| j        | j                 S | j        }| j        }|dk    r|}|dk    r|}| j        dk    rPt	          j        d| j        t          j        d| j        |||                   }| 	                    |||          }n| j        t          |||                   z  }|S )u{   return the star form stem of the treated word by the stemmer.
        All non affix letters are converted to a joker.
        The joker take by default DEFAULT_JOKER = "*".

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتكاتبانني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_stem()
            كاتب
            >>> print ArListem.get_starstem()
            *ات*

        @param prefix_index: indicate the left stemming position
        if = -1: not cosidered, and take the default word prefix lentgh.
        @type prefix_index:integer.
        @param suffix_index:indicate the right stemming position.
        if = -1: not cosidered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: stared form of stem.
        @rtype: unicode.
        r   r   [^r   )r,   r1   r2   r   r4   r   r   r   r    r   rX   )r=   r   r   r/   r1   r2   r   s          r>   get_starstemzArabicLightStemmer.get_starstem  s    0 9!q 0 0DIdj0119DJEq  #q  $!R'' ff1C1CUEVEVEV%W:xU
35 5 #44[$NN"jXd5j-A)B)BBr@   c                 T    |dk     r| j         d| j                 S | j         d|         S )u   return the prefix of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتكاتبانني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_prefix()
            أفت

        @param prefix_index: indicate the left stemming position
        if = -1: not cosidered, and take the default word prefix lentgh.
        @type prefix_index:integer.
        @return:  prefixe.
        @rtype: unicode.
        r   N)r-   r1   )r=   r   s     r>   
get_prefixzArabicLightStemmer.get_prefixD  s5      !#JTYJ//#M\M22r@   c                 T    |dk     r| j         | j        d         S | j         |d         S )u   return the suffix of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتكاتبانني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_suffix()
            انني

        @param suffix_index:indicate the right stemming position.
        if = -1: not cosidered, and take the default word suffix position.
        @type suffix_index: integer.
        @return:  suffixe.
        @rtype: unicode.
        r   N)r-   r2   )r=   r   s     r>   
get_suffixzArabicLightStemmer.get_suffixZ  s5      !#DJKK00#LMM22r@   c                 |    d                     |                     |          |                     |          g          S )u   return the affix of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتكاتبانني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_affix()
            أفت-انني

        @param prefix_index: indicate the left stemming position
            if = -1: not cosidered, and take the default word prefix lentgh.
        @type prefix_index:integer.
        @param suffix_index:indicate the right stemming position.
            if = -1: not cosidered, and take the default word suffix position.
        @type suffix_index: in4teger.
        @return:  suffixe.
        @rtype: unicode.
        -)joinr   r   r   s      r>   	get_affixzArabicLightStemmer.get_affixo  s<    & yy$//,77%%' ( ( 	(r@   r   c                     |                      |          |                     |          |                     ||          |                     ||          |                     ||          dS )u   return the affix tuple of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتضاربانني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_affix_tuple()
            {'prefix': u'أفت', 'root': u'ضرب', 'suffix': u'انني', 'stem': u'ضارب'}

        @param prefix_index: indicate the left stemming position
            if = -1: not cosidered, and take the default word prefix lentgh.
        @type prefix_index:integer.
        @param suffix_index:indicate the right stemming position.
            if = -1: not cosidered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: affix tuple.
        @rtype: dict.
        )prefixsuffixr   starstemr0   )r   r   r   r   r   r   s      r>   get_affix_tuplez"ArabicLightStemmer.get_affix_tuple  sf    ( __\22__\22==|<<((|DD==|<<
 
 	
r@   c                     |dk    rdS |                      |           |                     |           |                                 S )uJ  
        Stemming function, stem an arabic word, and return a stem.
        This function store in the instance the stemming positions
        (left, right), then it's possible to get other calculted
        attributs like: stem, prefixe, suffixe, root.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتضاربانني'
            >>> stem = ArListem.light_stem(word)
            >>> print ArListem.get_stem()
            ضارب
            >>> print ArListem.get_starstem()
            *ا**
            >>> print ArListem.get_left()
            3
            >>> print ArListem.get_right()
            6
            >>> print ArListem.get_root()
            ضرب

        @param word: the input word.
        @type word: unicode.
        @return: stem.
        @rtype: unicode.
        r   )transform2starsr   r   r=   r,   s     r>   
light_stemzArabicLightStemmer.light_stem  sH    6 3;;3T"""T }}r@   c                 @   || _         t          j        |          }|| _        t	          j        dt          j        z  t          j        t          j        z   |          }t	          j        d| j	        | j
        d| j        |          }|                    | j                  }|                    | j                  }|dk    rt          || j        dz
            }t!          |dz   t#          |          | j        z
            }|d|         }| j         ||         }||d         }t	          j        d| j	        z  | j        |          }| j        r#t	          j        d| j        z  | j        |          }t	          j        d| j
        z  | j        |          }||z   |z   }|                    | j                  }|                    | j                  }|dk     r%t          | j        t#          |          dz
            }|dk    r|d|         }|d	k    r"|| j        vr|dd
         }|d	k    r	|| j        v|dk     r2t!          t#          |          t#          |          | j        z
            }||d         }|r|| j        vr|dd         }|r	|| j        vt#          |          }t#          |          t#          |          z
  }| j         ||         }| j        r#t	          j        d| j        z  | j        |          }||z   |z   }|| _        || _        || _        |||fS )ut  
        Transform all non affixation letters into a star.
        the star is a joker(by default '*').
        which indicates that the correspandent letter is an original.
        this function is used by the stmmer to identify original letters.
        and return a stared form and stemming positions (left, right)

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتضاربانني'
            >>> starword, left, right = ArListem.transformToStrars(word)
            (أفت*ا**انني, 3, 6)

        @param word: the input word.
        @type word: unicode
        @return: (starword, left, right):
            - starword: all original letters converted into a star
            - left: the greater possible left stemming position.
            - right: the greater possible right stemming position.
        @rtype: tuple.
        [%s]r   r   r   r   Nz[^%s]r   r   r   )r,   r   strip_tashkeelr-   r4   r   
ALEF_MADDAHAMZAALEFr   r   r    findrfindminr   maxrX   r   r   r"   r$   r   r   r/   )r=   r,   r1   r2   r   r   r   s          r>   r   z"ArabicLightStemmer.transform2stars  s   , 	#D))vgu/0%+ej2H$OOvv$"5"5t7J7J7JK	T  yy$$

4:&&199tT3A566DaT4+A!ABBE%4%[F9T%Z(D%&&\FVHT%88$*fMMF! Mvht'994:tLLVHT%88$*fMMF$;v%Dyy$$

4:&& !88t-s4yy{;;D199%4%[FB,,61A#A#A B,,61A#A#AqyyCKKT43I)IJJ%&&\F $V4+;;;  $V4+;;;v;;DIIc&kk)E9T%Z(D ! Mvht'994:tLL$;v%D dE""r@   c                    |                      ||          }d}t          |          dk    r"|                     ||          | _        | j        S |                     ||          }d}t          |          t          |          k    r,t          |          D ]\  }}||         | j        k    r||z  }n|}|                     |          }t          |          dk    r|                     ||          }|| _        |S )u   return the root of the treated word by the stemmer.
        All non affix letters are converted to a joker.
        All letters in the joker places are part of root.
        The joker take by default DEFAULT_JOKER = "*".

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'أفتصربونني'
            >>> stem = ArListem .light_stem(word)
            >>> print ArListem.get_starword()
            أفت***ونني
            >>> print ArListem.get_root()
            ضرب

        @param prefix_index: indicate the left stemming position
            if = -1: not cosidered, and take the default
            word prefix lentgh.
        @type prefix_index:integer.
        @param suffix_index:indicate the right stemming position.
            if = -1: not cosidered, and take the default word suffix position.
        @type suffix_index: integer.
        @return: root.
        @rtype: unicode.
        r      r   )r   rX   _ajust_rootr0   r   	enumerater    normalize_root)r=   r   r   r   r0   r   ichars           r>   r   zArabicLightStemmer.extract_root  s    2 }}\<88t99>>((t44DI9$$\<@@x==CII%%$T?? ! !4A;$*,,DLD! D""4(( t99>>##D(33D	r@   c                    |s|S t          |          dk    rV|                    t          j        t          j                  }|                    t          j        t          j                  }|S |d         }|dd         }|t          j        t          j        fv rt          j        |z   }n|t          j        k    rt          j        |z   }n|| j        k    r*|t          j        t          j        fv r|t          j        z  }n|| j        k    r*|t          j        t          j        fv r|t          j        z  }nS|| j        k    rH|| j        k    r=t          |          dk    r||d         z  }n|d         t          j        z   |d         z   }|S )z]
        If the root has only three or two letters, we complete it by another letter
        r   r   r   Nr   r   )rX   r   r   r   WAWALEF_MAKSURAYEHr    )r=   r0   r   firstlasts        r>   r   zArabicLightStemmer._ajust_rootL  sf     	Kx==A''
EI>>H''(:EIFFHO }UZ+++9t#DDei9t#DDdj  Tej%)-D%D%DEIDDdj  Te.@%)-L%L%LEIDDdj  TTZ%7%78}}!!R  Aw	)472r@   c                     i }|D ]1}|}|D ]}||vri ||<   ||         }d|v rd|d         |<   *|di|d<   2|| _         | j         S )z
        Create a prefixes tree from given prefixes list
        @param prefixes: list of prefixes
        @type prefixes: list of unicode
        @return: prefixes tree
        @rtype: Tree stucture
        #)r9   )r=   prefixesprefixestreer   branchr   s         r>   r8   z&ArabicLightStemmer._create_prefix_treel  s      	- 	-F!F & &v%%#%F4Lf}}&)sF##%cns)!!r@   c                     i }|D ]:}|}|ddd         D ]}||vri ||<   ||         }d|v rd|d         |<   3|di|d<   ;|| _         | j         S )z
        Create a suffixes tree from given suffixes list
        @param suffixes: list of suffixes
        @type suffixes: list of unicode
        @return: suffixes tree
        @rtype: Tree stucture
        Nr   r   )r;   )r=   suffixessuffixestreer   r  r   s         r>   r:   z&ArabicLightStemmer._create_suffix_tree  s      	- 	-F!Fttt & &v%%#%F4Lf}}&)sF##%cns)!!r@   c                 Z   | j         }dg}d}|t          |          k     r_||         |v rUd|v r|                    |           ||         |v r|||                  }nn"|dz  }|t          |          k     r
||         |v U|t          |          k     rd|v r|                    |           |S )z
        lookup for prefixes in the word
        @param word: the given word
        @type word: unicode
        @return: list of prefixes starts positions
        @rtype: list of int
        r   r   r   )r9   rX   append)r=   r,   r  leftsr   s        r>   lookup_prefixesz"ArabicLightStemmer.lookup_prefixes  s     ##d))mmQ6 1 1f}}QAw&  Q FA #d))mmQ6 1 1 s4yy==SF]]LLOOOr@   c                 P   | j         }d}g }t          |          dz
  }|dk    r`||         |v rV||         |z   }d|v r|                    |dz              ||         |v r|||                  }nn|dz  }|dk    r
||         |v V|dk    rd|v r|                    |dz              |S )z
        lookup for suffixes in the word
        @param word: the given word
        @type word: unicode
        @return: list of suffixes starts positions
        @rtype: list of int
        r   r   r   r   )r;   rX   r  )r=   r,   r  r   rightsr   s         r>   lookup_suffixesz"ArabicLightStemmer.lookup_suffixes  s     #IIaK1ffaF**!WV^Ff}} ac"""Aw&  Q FA 1ffaF** 66cVmmMM!A#r@   c                    || _         t          j        |          | _        t	          j        dt          j        z  t          j        t          j        z   |          }| 	                    |          }| 
                    |          }|rt          |          | _        nd| _        |rt          |          | _        nd| _        t          dt!          |          fg          | _        |D ],}|D ]'}||dz   k    r| j                            ||f           (-|                     | j                  \  | _        | _        | j        S )u   generate  a list of  all possible segmentation positions
        (lef, right)  of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'فتضربين'
            >>> print ArListem.segment(word)
            set(([(1, 5), (2, 5), (0, 7)])

        @return: List of segmentation
        @rtype: set of tuple of integer.
        r   r   r   r   )r,   r   r   r-   r4   r   r   r   r   r  r  r   r1   r   r2   r'   rX   r3   addr   )r=   r,   r  r
  r   js         r>   r   zArabicLightStemmer.segment  s9    	 /55vgu/0%+ej2H$OO $$T**%%d++ 	E

DIIDI 	VDJJDJ!SYY 011 	2 	2A 2 2!88%))1a&1112
 !% 3 3D4E F F	4:  r@   c                     | j         S )u   return   a list of segmentation positions (left, right)
        of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'فتضربين'
            >>> ArListem.segment(word)
            >>> print ArListem.get_segment_list()
            set(([(1, 5), (2, 5), (0, 7)])

        @return: List of segmentation
        @rtype: set of tuple of integer.
        )r3   r<   s    r>   get_segment_listz#ArabicLightStemmer.get_segment_list  s       r@   c                 ~    |s| j         }g }|D ].\  }}|                    |                     ||                     /|S )ug   return   a list of affix tuple of the treated word by the stemmer.

        Example:
            >>> ArListem = ArabicLightStemmer()
            >>> word = u'فتضربين'
            >>> ArListem.segment(word)
            >>> print ArListem.get_affix_list()
            [{'prefix': u'ف', 'root': u'ضرب', 'suffix': u'ين', 'stem': u'تضرب'},
            {'prefix': u'فت', 'root': u'ضرب', 'suffix': u'ين', 'stem': u'ضرب'},
            {'prefix': u'', 'root': u'فضربن', 'suffix': u'', 'stem': u'فتضربين'}]

        @return: List of Affixes tuple
        @rtype: list of dict.
        )r3   r  r   )r=   r   r   r1   r2   s        r>   r   z!ArabicLightStemmer.get_affix_list  s[      	)(H
# 	A 	AZT%d224??@@@@r@   nounr   c                     |sdS |dk    rUt          |          dk    st          |          dk     rdS t          j        |v rdS t          |          dk    r!|                    t          j                  sdS t          |          dk    r[|d         t          j        t          j        fvr;|dd         t          j        t          j        t          j        t          j        fv rdS nd|                    t          j                  rE|dd         t          j        t          j        t          j        t          j        t          j        fv rdS t          j
        |          sdS n|d	k    rt          |          d
k    rdS dS dS )z Test if the stem is acceptedFverb   r      r   r   Nr     T)rX   r   r   r   r   r   r   NOONALEF_HAMZA_ABOVEr   is_verb_stamp)r=   r   tagr   s       r>   _valid_stemzArabicLightStemmer._valid_stem*  sk    	5&==4yy1}}D		Au"d**uTa
(C(Cu TaQEJ	3J(J(J"##;59eiUE["\\\ 5 ] ,, &+%)UZY^Ybdidz  }B  }G  BH  3H  3Hu$2488 uF]]4yyA~~u4tr@   c                    |                      |          }|                     |          }d}|r|dz   |z   }|                     ||          }|t          j        v r?|                     |d|          r(|t          j        v r|                     |d          rdS dS |t          j        v r|                     |d          rdS dS dS )zB
        validate affixes against a list of valid affixes
        Tr   r  r  F)r   r   r   r   r)   r  r*   r+   )r=   r   r   r   r   TAGaffixr   s           r>   r   z ArabicLightStemmer._verify_affixO  s     .... 	!3Jv%E]]<>>D3338H8HfV\8]8]3K777D<L<LTRX<Y<Y744K777D<L<LTRX<Y<Y74 5tr@   c                     |dk    r| j         dk    rdS |dk    r|| _         n| j         }t          j        |          | _        | j        S )a  
        Normalize a word.
        Convert some leters forms into unified form.
        @param word: the input word, if word is empty,
        the word member of the class is normalized.
        @type word: unicode.
        @return: normalized word.
        @rtype: unicode.
        r   )r,   r	   normalize_searchtextr.   r   s     r>   r	   zArabicLightStemmer.normalize  sQ     3;;49++3S[[DII9D#8>>r@   c                 t    |sg S | j                             |          }d|v r|                    d           |S )z
        Tokenize text into words
        @param text: the input text.
        @type text: unicode.
        @return: list of words.
        @rtype: list.
        r   )r7   splitremove)r=   textmylists      r>   tokenizezArabicLightStemmer.tokenize  sF      	I^))$//Ff}}c"""Mr@   c                 ,   |                      t          j        t          j        t          j        z             } |                      t          j        d          } |                      t          j        t          j                  } t          j        |           S ) test if word is a rootr   )	r   r   r   r   r   r   r   r   normalize_hamzar   s    r>   r   z!ArabicLightStemmer.normalize_root  sb     ||E,ek5:.EFF||E-r22||E.	::$T***r@   c                 N    t          |           dk    ot          |           dk    S )Nr   r   rX   r   s    r>   r   z'ArabicLightStemmer.is_root_length_valid  s#     D		Q/3t99a<0r@   c                     d | D             }|r|} t          t          t          |                     | j                  S )Nc                 8    g | ]}t          |          d k    |S )r   r,  )r   r   s     r>   r   z2ArabicLightStemmer.most_common.<locals>.<listcomp>  s#    222!c!ffkkAkkkr@   )key)r   r'   sortedcount)lsttrirootss     r>   r   zArabicLightStemmer.most_common  sE    22s222 	C3vc{{##3333r@   c                     || j         v S )r)  )r&   r   s     r>   r   zArabicLightStemmer.is_root  s    t~%%r@   c                 n    | sdS t          |           \  }t          fd| D                       }|fS )z4
        get the max left and the min right
        r   r   c                 &    g | ]\  }}|k    |S r   r   )r   r   r   ls      r>   r   z5ArabicLightStemmer.get_left_right.<locals>.<listcomp>  s"    +++u!adddddr@   )r   r   )ls_rr8  s      @r>   r   z!ArabicLightStemmer.get_left_right  sL    
  	5WW!+++++++,,!tr@   Nr6  )r   )r   r   )r  r   )r   )A__name__
__module____qualname____doc__r?   rC   rG   rK   rN   rQ   rT   rV   rZ   r]   ra   re   rh   rk   rn   rp   rt   rw   rz   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r8   r:   r  r  r   r  r   r  r   r	   r'  staticmethodr   r   r   r   r   r   r@   r>   r   r   5   s)        2H H HB# # #1 1 1# # #1 1 1" " "/ / /    & & &7 7 7& & &7 7 7$ $ $3 3 3     3 3 3     3 3 3  ) ) )' ' '9 9 9      "   >  ., , ,,           $  &$ $ $ $L  6+ + + +Z3 3 3 3,3 3 3 3*( ( ( (,
 
 
 
:# # #JR# R# R#h/ / / /b  @" " "." " "2  4  @(! (! (!\! ! !" ')    *" " " "J7 7 7 7x   (     + + \+ 1 1 \1 4 4 \4& & &   \  r@   r   )u   أفتضاربانني   بالمكتبةu   مزدهرةu   كاتبu
   مضروبu
   مضاربu
   مردودu
   مطلوبu   مشتتu   مزتهرةu
   مضطربrA  u   مالبدرسمهu
   مكتوبu   الآجالu   بالبلدانu   وفيهما1245Taha@r   zroot:r1   z	left stemr   r2   
right_stemr   
   r/   r   r	   r-   ),r?  
__future__r   r   r   r   r4   syspathr  pyarabic.arabyr   r<  r	   r
   r   r   r   r   r   r   ARLISTEMwordlistr,   r   printr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  reprr   r   r@   r>   <module>rO     s   4            


 



          zHOOE""""""!!!!!![ [ [ [ [ [ [ [|$ z!!##H  H*  */ */D!!!fh''))***h$%%%gx((**+++ 	fX&&(()))k(,---h!!##$$$h!!!$$%%% 	gh((**+++l8.///h++--...h!!"%%&&&h&&(())) 	j(//11222j..00111 	k82244555mH4466777 	ht$$%%%h''))***dd8**,,--....E 0*/ */r@   