
    -iVW                     z    d Z ddlmZ ddlmZ ddlmZ ddlZ G d d          Z	d Z
edk    r e
             dS dS )z 
    Arabic _collocations Class
    Nc                   p    e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
d
 Zd Zd Zd Zd Zd Zd ZddZdS )CollocationClassz(
        Arabic _collocations Class
    Fc                 0   d| _         d| _        d| _        d| _        || _        ddddddd	}i | _        d
| _        t          j        d|          | _	        t          j                                        D ]"}t          j        |         d         | j        |<   #d S )N      '~r            )id	vocalizedunvocalizedrulecategorynoteTcollocationsv)minimaxi	delimiterunknown_delimitershow_delimitercollo_cachecache_enabledcollocationdictionarycollocationDictionary
collo_dictcconstGENERAL_COLLOCATIONSkeys)selfr   COLLOCATION_DICTIONARY_INDEXkeys       R/var/www/html/speakWrite/venv/lib/python3.11/site-packages/maskouk/collocations.py__init__zCollocationClass.__init__   s    		 #',(
 (
$ !/E46 6.3355 	3 	3C(-c2 S!!	3 	3    c                 n    t          || j                  | _        t          || j                  | _        dS )z
        set minimum
        Nminr   maxr   )r"   min0s     r%   set_minzCollocationClass.set_min7   .    
 di((	di((			r'   c                 n    t          || j                  | _        t          || j                  | _        dS )z
        set maximum
        Nr)   )r"   max0s     r%   set_maxzCollocationClass.set_max>   r.   r'   c                     || _         dS )z
        set the delimiter for collocations output
        @param delimiter : the given delimiter.
        @type delimiter : one unicode char.
        N)r   r"   r   s     r%   set_delimiterzCollocationClass.set_delimiterE   s     #r'   c                     || _         dS )z
        set the delimiter for unvocalized collocations output
        @param delimiter: the given delimiter.
        @type delimiter: one unicode char.
        N)r   r3   s     r%   set_unknown_delimiterz&CollocationClass.set_unknown_delimiterM   s     "+r'   c                     d| _         dS )B
        Enable the option to show collocation delimiters
        TNr   r"   s    r%   enable_show_delimiterz&CollocationClass.enable_show_delimiterU   s     #r'   c                     d| _         dS )r8   FNr9   r:   s    r%   disable_show_delimiterz'CollocationClass.disable_show_delimiter[   s     $r'   c                     d| _         dS )z3
        Enable the option to enable cache
        TNr   r:   s    r%   enable_cachezCollocationClass.enable_cachea   s     "r'   c                     d| _         dS )zC
        disable the option to show collocation delimiters
        FNr?   r:   s    r%   disable_cachezCollocationClass.disable_cacheg   s     #r'   c                    d                     |          }t          |          dk     rdS t          |          dk    r| j        r|| j        v r|S ntt          j                            |          sdS d                     |          }| j        r|| j        v r|S | j                            |          }t          |          dk    r*| j        r!|| j        vr|d         }|d         | j        |<   |S |d         t          j
        t          j        t          j        t          j        t          j        fv r|d         }|dd         }| j                            |          }t          |          dk    rQ| j        rH|| j        vr?|d         d         }|dk    r||z   | j        |<   n|| j        |<   || j        vr
|| j        |<   |S dS dS dS )	uc  
        Return The vocalized text if the word list is collocated.
        
        Example:
            >>> wlist = [u"كرة", u"القدم"]
            >>> # test if collocation exists
            >>> results = mydict.is_collocated(wlist)
            >>> print("inuput:", wlist)
            >>> print("output:",results)
            inuput: ['كرة', 'القدم']
            output: كرة القدم
            >>> wlist = [u"شمس", u"النهار"]
            >>> results = mydict.is_collocated(wlist)
            >>> print("inuput:", wlist)
            >>> print("output:",results)
            inuput: ['شمس', 'النهار']
            output: False

        @param wordlist: word of list, 2 or more words.
        @type wordlist: list of unicode.
        @return : The collocation as a key if exists. else False.
        @rtype: dict/None.
         r   Fr
   r   r   N )joinlenr   r   r   	token_patsearchr   lookuparabyFEHWAWBEHLAMKAF)r"   wordlistr$   idlistfirst_entryfirst_letternew_keyvocalized_collocations           r%   is_collocatedzCollocationClass.is_collocatedm   s   6 ii!!x==??5]]1__ ! cT-=&=&=
"))#.. 9	5 ))H%%C! cT-=&=&=
 _++C00F 6{{a% FcT5E.E.E"()K-8-ED$S)

 q6eiEIuy  #&q6L!!""gG "_33G<<F 6{{a''- :t///4:1Ik4J14::8D 596 0 5 5 9N 0 5 (/$2B'B'B$9 %)$4W$=  #
!&uur'   c                 P   |}g }t          |          |k    rg }t          |          D ],}|                                }|                    d|           -|r~|                     |          ri|                     |          }|r|                    |           n<|                    |                                           |                    |           t          |          |k    |                                 |                    |           |                                 |S )u  
        Lookup for ngram (min number of words), in the word list.
        return a list of single words and collocations.
        
        Example:
            >>> # detect collocations in phrase    
            >>> text = u"لعبنا مباراة كرة القدم في بيت المقدس"
            >>> wordlist = araby.tokenize(text)
            >>> results  = mydict.ngramfinder(2, wordlist)
            >>> print("inuput:", text)
            >>> print("output:",results)
            inuput: لعبنا مباراة كرة القدم في بيت المقدس
            output: ['لعبنا', 'مباراة', 'كرة القدم', 'في', 'بيت المقدس']
        
        @param wordlist: word of list, 2 or more words.
        @type wordlist: list of unicode.
        @param mini: minimum number of words in the collocation
        @type mini: integer.        
        @return : list of words and collocations, else False.
        @rtype: list /None.
        r   )	rG   rangepopinsertis_possible_collocationrW   appendextendreverse)	r"   r   rQ   listenewlistsublisticurrentresults	            r%   ngramfinderzCollocationClass.ngramfinder   s#   , %jjD  G4[[ + +))++q'****  *477@@ * ++G44 *NN6****NN7;;==111LL)))1 %jjD  4 	ur'   c                     t           j                                        D ]<}t          j        |t           j        |                             dd          |          }=|S )u[  
        Lookup for long collocations in a text.
        return a  vocalized words collocations.
        
        Example:
            
            >>> # get Long collocations
            >>> text = u" قلت لهم السلام عليكم ورحمة الله تعالى وبركاته ثم رجعت"
            >>> results  = mydict.lookup4long_collocations(text)
            >>> print("inuput:", text)
            inuput:  قلت لهم السلام عليكم ورحمة الله تعالى وبركاته ثم رجعت
            >>> print("output:",results)   
            output:  قلت لهم السّلامُ عَلَيكُمْ وَرَحْمَةُ اللهِ تَعَالَى وبركاته ثم رجعت
            
        @param inputtext: given text
        @type inputtext:  unicode.
        @return : text.
        @rtype: unicode.
        r   rE   )r   r    r!   resubget)r"   	inputtextks      r%   lookup4long_collocationsz)CollocationClass.lookup4long_collocations  s_    ( ,1133 	 	A6#>q#A#E#Ec$ $ IIr'   c                    d}g }g }|D ]3}|| j         vr|                     |          }|| j         |<   |r|| j         |                                         v r|                                 |                    d           |                    d           | j         |         |         }|                                 |                    |                    d                     nU|                    d           |                    |           n*|                    d           |                    |           |}5||fS )u  
        Lookup for all ngrams , in the word list.
        return a list of vocalized words collocations
        
        Example:
        
            >>> # detect collocations in phrase    
            >>> text = u"لعبنا مباراة كرة القدم في بيت المقدس"
            >>> wordlist = araby.tokenize(text)
            >>> results   = mydict.lookup(wordlist)
            >>> print("inuput:", text)
            >>> print("output:",results)
            inuput: لعبنا مباراة كرة القدم في بيت المقدس
            output: (['لعبنا', 'مباراة', 'كُرَة', 'الْقَدَمِ', 'في', 'بَيْت', 'الْمَقْدِسِ'], ['CO', 'CO', 'CB', 'CI', 'CO', 'CB', 'CI'])
        
        
        @param wordlist: word of list, 2 or more words.
        @type wordlist: list of unicode.
        @return : dict of words attributes like dict {'vocalized':
        vocalizedword list, 'category': categoryOf_collocation}. else False.
        @rtype: dict of dict /None.
        FCBCIrD   CO)r   is_collocated_wordr!   rZ   r]   r^   split)r"   rQ   previoustaglistvocalized_listwordre   vocalized_tuples           r%   rJ   zCollocationClass.lookup  sT   4  	 	D4+++0066)/ & ,4+H5::<<<<KKMMMNN4(((NN4(((&*&6x&@&FO"&&((("))/*?*?*D*DEEEENN4((("))$////t$$$%%d+++HH ''r'   c                 v   t          j        d|          ri S i }| j                            |d          }t	          |          dk    r:|D ]6}|d                             d          d         }|d         r|d         ||<   7n|d         t          j        t          j        t          j	        t          j
        t          j        fv ru|d         }|dd	         }| j                            |d          }t	          |          dk    r4|D ]1}|d                             d          d         }||d         z   ||<   2|S )
u  
        Return The list of collocations started by given word, else False.
        
        Example:
            >>> # get all collocations for a specific word
            >>> word1 = u"كرة"
            >>> results  = mydict.is_collocated_word(word1)
            >>> print("inuput:", word1)
            >>> print("output:",results)
            inuput: كرة
            output: {'القدم': 'كُرَة الْقَدَمِ'}
            >>>
            >>> word = u"بيت"
            >>> # get all collocations for a specific word
            >>> results  = mydict.is_collocated_word(word)
            >>> print("inuput:", word)
            >>> print("output:",results)
            inuput: بيت
            output: {'العدة': 'بَيْت الْعِدَّةِ', 'المستأجر': 'بَيْت الْمُسْتَأْجِرِ', 'المشتري': 'بَيْتِ الْمُشْتَرِي', 'الرجل': 'بَيْت الرَّجُلِ', 'البناء': 'بَيْت الْبِنَاءِ', 'الزوج': 'بَيْت الزَّوْجِ', 'المال': 'بيت المال', 'المقدس': 'بَيْت الْمَقْدِسِ', 'البائع': 'بَيْت الْبَائِعِ', 'الخلاء': 'بَيْت الْخَلَاءِ', 'الأب': 'بَيْت الْأَبِ', 'الله': 'بَيْت اللّهِ'}
        
        @param word: input word.
        @type word: unicode.
        @return : dict of collocations and vocalized collocations if exists, the keys are second words in collocations. else False.
        @rtype: dict/Boolean.
        z[::pounctuation::]T)
singlewordr
   r   rD   r   r   N)rh   rI   r   rJ   rG   rs   rK   rL   rM   rN   rO   rP   )r"   rw   re   rR   itemr$   rT   new_words           r%   rr   z#CollocationClass.is_collocated_wordU  sU   4 9)400 	I''4'@@ v;;! 4 4=)//44Q7$ 4"&{"3F3K	4 Aw59eiEIY  #Aw8//t/LL v;;!## & G G"=177<<Q?&2T+5F&Fsr'   rE   r   c                 6   |}t          |          |k     rdS |d         }|d         }t          j        |          }t          j        |          }t          j                            |          rt          j                            |          sdS |t          j        v rdS |t          j        v rdS |t          j        vr|	                    d          r|	                    d          rdS |
                    d          r|	                    d          rd	S |
                    d          r|
                    d          rd
S |dk    r%|t          j        v r|	                    d          rdS |
                    d          r|	                    d          rdS dS )uv  
        Guess if the given list is a possible collocation
        This is used to collect unkown collocations, from user input
        return True oor false
        
        Example:
        
            >>> text = u"ظهر رئيس الوزراء السيد عبد الملك بن عامر ومعه أمير دولة غرناطة ونهر النيل انطلاق السباق"
            >>> wordlist = araby.tokenize(text)
            >>> previous = "__"
            >>> for wrd in wordlist:
            >>>    wlist = [previous, wrd]
            >>>    results  = mydict.is_possible_collocation(wlist, length = 2)
            >>>    print("inuput:", wlist)
            >>>    print("output:", results)   
            >>>    previous  = wrd
            inuput: ['__', 'ظهر']
            output: 100
            inuput: ['ظهر', 'رئيس']
            output: 100
            inuput: ['رئيس', 'الوزراء']
            output: 100
            inuput: ['الوزراء', 'السيد']
            output: 20
            inuput: ['السيد', 'عبد']
            output: 100
            inuput: ['عبد', 'الملك']
            output: 15
            inuput: ['الملك', 'بن']
            output: 100
            inuput: ['بن', 'عامر']
            output: 15
            inuput: ['عامر', 'ومعه']
            output: 100
            inuput: ['ومعه', 'أمير']
            output: 100
            inuput: ['أمير', 'دولة']
            output: 100
            inuput: ['دولة', 'غرناطة']
            output: 10
            inuput: ['غرناطة', 'ونهر']
            output: 100
            inuput: ['ونهر', 'النيل']
            output: 100
            inuput: ['النيل', 'انطلاق']
            output: 100
            inuput: ['انطلاق', 'السباق']
            output: 100

        @param wordlist: word of list, 2 or more words.
        @type wordlist: list of unicode.
        @param length: minimum number of words in the collocation
        @type length: integer.        
        @return : the rule of found collocation, 100 default.
        @rtype: interger.
        r   r
   
      u   ال   u   ة   (   rE   2   u   ات<   d   )rG   rK   strip_tashkeelr   rH   rI   ADDITIONAL_WORDSNAMED_PRIORSPECIAL_DEFINED
startswithendswithtab_noun_context)	r"   rQ   contextlengthlist2item_v1item_v2item1item2s	            r%   r\   z(CollocationClass.is_possible_collocation  s   r u::f1AhGAhG(11E(11E $++E22 ##E**r&111r&,,,rv555$$W-- 53C3CG3L3L 2^^E** u/?/?/H/H 2 ^^E** u~~e/D/D 2 nnF4K)K)K$$W-- *L2 ^^G,, 1A1A'1J1J 23r'   N)F)rE   r   )__name__
__module____qualname____doc__r&   r-   r1   r4   r6   r;   r=   r@   rB   rW   rf   rm   rJ   rr   r\    r'   r%   r   r      s        3 3 3 3<) ) )) ) )# # #+ + +# # #$ $ $" " "# # #_ _ _B5 5 5n  26( 6( 6(n8 8 8va a a a a ar'   r   c                     dt           _        t                      } g d}d}|dz  }|dz  }|dz  }|                    d          }|                     d           |                     d	           |                                  t          d
          D ]M}|                     |          }t          |d
                    |                              d                     N|}t          j                                        D ]}t          |                    d          t          j        |                             dd                              d                     ||v rt          d           t!          j        |t          j        |                             dd          |          }t          d|                    d                     dS )z
    main test zdata/collocations.sqlite)
u   قبلu   صلاةu
   الفجرu   كرةu
   القدمu   فيu   دولةu   قطرu   الآنu   أنub  تغمده الله برحمته . أشهد أن لا إله إلا الله وحده لا
     شريك له . أشهد أن محمدا عبده ورسوله .
 صلى الله عليه وآله وصحبه وسلم . أشهد أن لا إله إلا الله .
 أشهد أن محمدا رسول الله . صلى الله عليه وسلم
      .
    u   والحمد لله . الحمد لله . بالحمد لله .
 بسم الله الرحمن الرحيم . عبد الله . بعبد الله .u)   بسم الله الرحمن الرحيمz  taha zerrouki rD   #@r
   	utf8r   rE   okzlong collo, N)r   FILE_DB_collocationClassrs   r4   r6   r;   rY   rJ   printrF   encoder   r    r!   rj   rh   ri   )collorQ   wordsrc   ra   rk   rl   s          r%   mainlyr     s    %@!E< < <HE 
 V VE	99E	  E{{3H		$$$	!!! 1XX 5 5,,x((aG$$++F334444I(--//  ahhv#A&**333::6BB	D 	D 	D	>>4LLLFAv:1=AA#rJJ	 		 
.)**62277777r'   __main__)r   maskouk.collocationdictionaryr   maskouk.collocation_constcollocation_constr   pyarabic.arabyrK   rh   r   r   r   r   r'   r%   <module>r      s     > = = = = = * * * * * *       				] ] ] ] ] ] ] ]~$8 $8 $8P z
FHHHHH r'   