
    -i                     ^   d Z ddlmZmZmZmZ ddlZddlZ	 e n# e	$ r e
ZY nw xY wedk    rddlZnddlmZ ddlZdZdZd	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZ dZ!dZ"dZ#dZ$dZ%dZ&dZ'dZ(d Z)d!Z*d"Z+d#Z,d$Z-d%Z.d&Z/d'Z0d(Z1d)Z2d*Z3d+Z4d,Z5d-Z6d.Z7d/Z8d0Z9d1Z:d2Z;d3Z<d4Z=d5Z>d6Z?d7Z@d8ZAd9ZBd:ZCd;ZDd<ZEd=ZFd>ZGd?ZHd@ZIdAZJdBZKdCZLdDZMdEZNdFZOdGZPdHZQdIZRdJZSdKZTdLZUdMZVdNZWdOZXdPZYdQZZdRZ[dSZ\dTZ]dUZ^dVZ_dWZ`dXZadYZbdZZcd[Zdd\Zed]Zfd^Zgd_ZhdTZid`ZjdaZkdbZldcZmddZndeZodfZpdgZqdhZrdiZsdjt                    g eeeeeeee e!e"e#e$e%e&e'e(e)e*e+e,e.e/e0e1e2e3e4e5e6e7eeeeee          Zue;e<e=e>e?e@eAeBeCeDf
ZveEeFeGeHeIeJeKeLeMeNf
ZweOePeQeReSeTeUeVeWeXf
ZxeaebecedeeefehegfZyeaebecedeeefehfZzedeeefehfZ{eaebecfZ|e-Z}elemeneofZ~eeee9e:eefZeeeee^e6eifZee5e7e6fZe7ee6ekfZe5eejfZeefZeiejekfZeeeeeeeee e+e,e.e/e0e2e4e5e7fZeee!e"e#e$e%e&e'e(e)e*e1e3fZi ededkedledledmednedoe dpe!dqe"dre#dse$dte%due&dve'dwe(dxe)dyi e*dze+d{e,d|e.d}e/d~e0de1de2de3de4de5de7dedededededediZi ededededededede de!de"de#de$de%de&de'de(de)di e*de+de,de.de/de0de1de2de3de4de5de7dede-dede6dededededeadebdecdeddeedefdegdehdiZdjt                    e          Zdjt                    ez          Zdjt                    ey          Z ej        ddjt                    ez          z   dz   ej                  Z ej        ddjt                    ez          ddjt                    e|          dej                  Z ej        ddjt                    e{          z   dz   ej                  Z ej        ddjt                    ey          z   dz   ej                  Z ej        ddjt                    e          z   dz   ej                  Z ej        ddjt                    e          z   dz   ej                  Z ej        ddjt                    e~          z   dz   ej                  Z ej        dej                  Z ej        dej                  Z ej        d          Z ej        delemeodej                  Z ej        dej                  Z ej        dej                  Zd  edd          D             Zd Zd ZdÄ ZdĄ Zdń ZdƄ ZdǄ ZdȄ ZdɄ Zdʄ Zd˄ Zd̄ Zd̈́ Zd΄ Zdτ ZdЄ Zdф Zd҄ ZddӄZdԄ ZdՄ Zdք Zdׄ Zd؄ Zdل Zdڄ Zdۄ Zd܄ Zd݄ Zdބ Zd߄ Zd Zd Zd Zd Zd Zd Zd ZÐddZd Zd ZƐddZd Zd ZɐddZd Zd Zd Zd Zdjg g fdZdedefdZd Zd ZԐd	dZedk    r ed edd                      ed e͐d d                      ed eɐd d                      ed e͐dd                      ed eɐdd                      ed edd                      ed edd                     dZg dZ eeצ          Z e֐d            ee٦           dS dS (
  a  
Arabic module

Features:
=========
 - Arabic letters classification
 - Text tokenization
 - Strip Harakat (all, except Shadda, tatweel, last_haraka)
 - Sperate and  join Letters and Harakat
 - Reduce tashkeel
 - Mesure tashkeel similarity (Harakats, fully or partially vocalized, similarity with a template)
 - Letters normalization (Ligatures and Hamza)

@author: Taha Zerrouki
@contact: taha dot zerrouki at gmail dot com
@copyright: Arabtechies, Arabeyes, Taha Zerrouki
@license: GPL
@date:2010/03/01
@version: 0.1
    )absolute_importprint_functionunicode_literalsdivisionN__main__   )stacku   ،u   ؛u   ؟u   ءu   آu   أu   ؤu   إu   ئu   اu   بu   ةu   تu   ثu   جu   حu   خu   دu   ذu   رu   زu   سu   شu   صu   ضu   طu   ظu   عu   غu   ـu   فu   قu   كu   لu   مu   نu   هu   وu   ىu   يu   ٓu   ٔu   ٕu   ٠u   ١u   ٢u   ٣u   ٤u   ٥u   ٦u   ٧u   ٨u   ٩0123456789u   ۰u   ۱u   ۲u   ۳u   ۴u   ۵u   ۶u   ۷u   ۸u   ۹u   ٪u   ٫u   ٬u   ٭u   ٰu   ٱu   ۔u   ﻿u   ًu   ٌu   ٍu   َu   ُu   ِu   ّu   ْu   ۥu   ۦu   ﻻu   ﻷu   ﻹu   ﻵu   لاu   لأu   لإu   لآ                      	   
                                                            u   ألفu   باءu   تاءu   تاء مربوطةu   ثاءu   جيمu   حاءu   خاءu   دالu   ذالu   راءu   زايu   سينu   شينu   صادu   ضادu   طاءu   ظاءu   عينu   غينu   فاءu   قافu   كافu   لامu   ميمu   نونu   هاءu   واوu   ياءu   همزةu
   تطويلu   ألف ممدودةu   ألف مقصورةu   همزة على الألفu   همزة على الواوu   همزة تحت الألفu   همزة على الياءu   فتحتانu
   ضمتانu   كسرتانu   فتحةu   ضمةu   كسرةu   شدةu   سكون[]z]$|[u   ([^\wًٰ-ْ']+)u   ([\wًٰ-ْ']+)z		||||    ([^؀-ْz\s\d])u   ([^؀-ۿﭐ-﷿ﹰ-﻿ݐ-ݿ])u(   \s*([?؟!.,،:]+(?:\s+[?؟!.,،:]+)*)\s*c                 v    g | ]6}t          j        t          |                    d k    't          |          7S )Mn)unicodedatacategoryunichr).0xs     L/var/www/html/speakWrite/venv/lib/python3.11/site-packages/pyarabic/araby.py
<listcomp>r<   "  s;    bbbA+:NvVWyy:Y:Y]a:a:afQii:a:a:a       i  c                     | t           k    S )z)Checks if the given ``archar``Sukun Mark.)SUKUNarchars    r;   is_sukunrC   Z  s    U?r=   c                     | t           k    S )z0Checks if the given ``archar`` is  Shadda Mark. )SHADDArA   s    r;   	is_shaddarF   _  s    Vr=   c                     | t           k    S )z7Checks if the given ``archar`` Tatweel letter modifier.)TATWEELrA   s    r;   
is_tatweelrI   d  s    Wr=   c                     | t           v S )z,Checks if the given ``archar`` Tanwin Marks )TANWINrA   s    r;   	is_tanwinrL   i  s    Vr=   c                     | t           v S )zChecks if the given ``archar`` Arabic Tashkeel Marks (
        - FATHA, DAMMA, KASRA, SUKUN,
        - SHADDA,
        - FATHATAN, DAMMATAN, KASRATAn).)TASHKEELrA   s    r;   is_tashkeelrO   n  s    
 Xr=   c                     | t           v S )zYChecks if the given ``archar`` Arabic Harakat Marks (FATHA, DAMMA, KASRA, SUKUN, TANWIN).)HARAKATrA   s    r;   	is_harakarR   v  s    Wr=   c                     | t           v S )zQChecks if the given ``archar``  short Harakat Marks (FATHA, DAMMA, KASRA, SUKUN).)SHORTHARAKATrA   s    r;   is_shortharakarU   {  s    \!!r=   c                     | t           v S )zChecks for Arabic  Ligatures like LamAlef.
    (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )
LIGUATURESrA   s    r;   is_ligaturerX     s     Zr=   c                     | t           v S )zChecks for Arabic  Hamza forms.
    HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,
    ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE)
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )HAMZATrA   s    r;   is_hamzar[     s     Vr=   c                     | t           v S )zChecks for Arabic Alef forms.
    ALEFAT = (ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_WASLA, ALEF_MAKSURA)
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )ALEFATrA   s    r;   is_alefr^     s     Vr=   c                     | t           v S )zChecks for Arabic Yeh forms.
    Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )YEHLIKErA   s    r;   
is_yehlikera          Wr=   c                     | t           v S )zChecks for Arabic Waw like forms.
    Waw forms : WAW, WAW_HAMZA, SMALL_WAW
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )WAWLIKErA   s    r;   
is_wawlikere     rb   r=   c                     | t           v S )zChecks for Arabic Teh forms.
    Teh forms : TEH, TEH_MARBUTA
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )TEHLIKErA   s    r;   is_tehrh     rb   r=   c                     | t           v S )zChecks for Arabic Small letters.
    SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )SMALLrA   s    r;   is_smallrk     s     U?r=   c                     | t           v S )zChecks for Arabic Weak letters.
    Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )WEAKrA   s    r;   is_weakrn          T>r=   c                     | t           v S )zChecks for Arabic Moon letters.
    Moon Letters :
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )MOONrA   s    r;   is_moonrr     ro   r=   c                     | t           v S )zChecks for Arabic Sun letters.
    Moon Letters :
    @param archar: arabic unicode char
    @type archar: unicode
    @return:
    @rtype:Boolean
    )SUNrA   s    r;   is_sunru     s     S=r=   c                 8    t                               | d          S )zreturn Arabic letter order between 1 and 29.
    Alef order is 1, Yeh is 28, Hamza is 29.
    Teh Marbuta has the same ordre with Teh, 3.
    @param archar: arabic unicode char
    @type archar: unicode
    @return: arabic order.
    @rtype: integer
    r   )ALPHABETIC_ORDERgetrA   s    r;   orderry     s     ***r=   c                 8    t                               | |          S )aF  return Arabic letter name in arabic.     Alef order is 1, Yeh is 28,
    Hamza is 29. Teh Marbuta has the same ordre with Teh, 3.
    @param archar: arabic unicode char
    @type archar: unicode
    @param default_name: arabic unicode char
    @type default_name: unicode    
    @return: arabic name.
    @rtype: unicode
    )NAMESrx   )rB   default_names     r;   namer}     s     99V\***r=   c                      g } t          dd          D ]b}	 |                     t          |                     &# t          $ r% |                     t	          |                     Y Tt
          $ r Y _w xY w| S )u   return a list of arabic characteres .
    Return a list of characteres between ، to ْ
    @return: list of arabic characteres.
    @rtype: unicode
    r>   iS  )rangeappendr8   	NameErrorchr
ValueError)mylistis     r;   arabicranger     s     F67##  	MM&))$$$$ 	" 	" 	"MM#a&&!!!!! 	 	 	D	Ms   "9,A3'	A32A3c                 >    t          j        t          |           rdS dS )zChecks if the arabic word  contains shadda.
    @param word: arabic unicode char
    @type word: unicode
    @return: if shadda exists
    @rtype:Boolean
    TF)researchrE   words    r;   
has_shaddar      s"     
y t5r=   c                 b    |                                  rdS | D ]}t          |          r ndS dS )zChecks if the arabic word is vocalized.
    the word musn't  have any spaces and pounctuations.
    @param word: arabic unicode char
    @type word: unicode
    @return: if the word is vocalized
    @rtype:Boolean
    FT)isalpharO   )r   chars     r;   is_vocalizedr   /  sN     ||~~ u  t 	E	 u4r=   c                 P    t          t          j        t          |                     S )zChecks if the arabic text is vocalized.
    The text can contain many words and spaces
    @param text: arabic unicode char
    @type text: unicode
    @return: if the word is vocalized
    @rtype:Boolean
    )boolr   r   HARAKAT_PATTERNtexts    r;   is_vocalizedtextr   A  s     	/400111r=   c                 >    t                               |           rdS dS )a;   Checks for an  Arabic standard Unicode block characters
    An arabic string can contain spaces, digits and pounctuation.
    but only arabic standard characters, not extended arabic
    @param text: input text
    @type text: unicode
    @return: True if all charaters are in Arabic block
    @rtype: Boolean
    FT)ARABIC_STRINGr   r   s    r;   is_arabicstringr   L  s$     D!! u4r=   c                 >    t                               |           rdS dS )z Checks for an  Arabic Unicode block characters
    @param text: input text
    @type text: unicode
    @return: True if all charaters are in Arabic block
    @rtype: Boolean
    FT)ARABIC_RANGEr   r   s    r;   is_arabicranger   Z  s$     4   u4r=   c           	         t          |           dk    rdS t          j        dt          t          t
          d|           rdS t          | d                   s| d         t          t          fv rdS t          j	        dt          z  |           rdS t          j	        dt          dt          t          t          d|           rdS t          j        t          t          z   |           rdS d	S )
a%   Checks for an valid Arabic  word.
    An Arabic word not contains spaces, digits and pounctuation
    avoid some spelling error, TEH_MARBUTA must be at the end.
    @param word: input word
    @type word: unicode
    @return: True if all charaters are in Arabic block
    @rtype: Boolean
    r   Fr3   ])z^(.)*[%s](.)+$z^(.)*[z]([^z])(.)+$T)lenr   r   LAM_ALEFLAM_ALEF_HAMZA_ABOVELAM_ALEF_MADDA_ABOVErR   	WAW_HAMZA	YEH_HAMZAmatchALEF_MAKSURATEH_MARBUTADAMMAKASRAFATHArE   r   s    r;   is_arabicwordr   f  s     4yyA~~u	h 4 46J6J6JLMQ
S 
S u	47		 tAw9i*@@@u	#l2D	9	9 u	;;uueee56:
< 
< u	6F?D	)	) utr=   c                     | d         S )z
    Return the first char
    @param word: given word
    @type word: unicode
    @return: the first char
    @rtype: unicode char
    r    r   s    r;   
first_charr     s     7Nr=   c                     | dd         S )z
    Return the second char
    @param word: given word
    @type word: unicode
    @return: the first char
    @rtype: unicode char
    r   r   r   r   s    r;   second_charr     s     !9r=   c                     | dd         S )z
    Return the last letter
    example: zerrouki; 'i' is the last.
    @param word: given word
    @type word: unicode
    @return: the last letter
    @rtype: unicode char
    Nr   r   s    r;   	last_charr     s     9r=   c                     | dd         S )z
    Return the second last letter example: zerrouki; 'k' is the second last.
    @param word: given word
    @type word: unicode
    @return: the second last letter
    @rtype: unicode char
    r   r   r   s    r;   secondlast_charr     s     2;r=   c                 l    | s| S t          |           r t          D ]}|                     |d          } | S )u  Strip Harakat from arabic word except Shadda.
    The striped marks are :
        - FATHA, DAMMA, KASRA
        - SUKUN
        - FATHATAN, DAMMATAN, KASRATAN,

    Example:
        >>> text = u"الْعَرَبِيّةُ"
        >>> strip_harakat(text)
        >>> العربيّة

    @param text: arabic text.
    @type text: unicode.
    @return: return a striped text.
    @rtype: unicode.
    r   )r   rQ   replacer   r   s     r;   strip_harakatr     sJ    (  *	d		 * 	* 	*D<<b))DDKr=   c                 ^    | r*t          |           rt          j        t          d|           S | S )u  Strip the last Haraka from arabic word except Shadda.
    The striped marks are :
        - FATHA, DAMMA, KASRA
        - SUKUN
        - FATHATAN, DAMMATAN, KASRATAN

    Example:
        >>> text = u"الْعَرَبِيّةُ"
        >>> strip_lastharaka(text)
        الْعَرَبِيّة

    @param text: arabic text.
    @type text: unicode.
    @return: return a striped text.
    @rtype: unicode.
    r   )r   r   subLASTHARAKA_PATTERNr   s    r;   strip_lastharakar     s6    "  9 	96,c4888Kr=   c                 l    | s| S t          |           r t          D ]}|                     |d          } | S )u  Strip vowels from a text, include Shadda.
    The striped marks are :
        - FATHA, DAMMA, KASRA
        - SUKUN
        - SHADDA
        - FATHATAN, DAMMATAN, KASRATAN,, , .

    Example:
        >>> text = u"الْعَرَبِيّةُ"
        >>> strip_tashkeel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: return a striped text.
    @rtype: unicode.
    r   )r   rN   r   r   s     r;   strip_tashkeelr     sJ    $  *	d		 * 	* 	*D<<b))DDKr=   c                 N    | s| S t           D ]}|                     |d          } | S )up  Strip small_letters from a text
    The striped marks are :
        - Small Alef الألف الخنجرية, .
        -Small WAW
        -Small Yeh
    Example:
        >>> text = u"الرحمنٰ"
        >>> strip_small(text)
        الرحمن

    @param text: arabic text.
    @type text: unicode.
    @return: return a striped text.
    @rtype: unicode.
    r   )rj   r   r   s     r;   strip_smallr     s:        & &||D"%%Kr=   c                 8    |                      t          d          S )u#  
    Strip tatweel from a text and return a result text.

    Example:
        >>> text = u"العـــــربية"
        >>> strip_tatweel(text)
        العربية

    @param text: arabic text.
    @type text: unicode.
    @return: return a striped text.
    @rtype: unicode.

    r   )r   rH   r   s    r;   strip_tatweelr     s     <<$$$r=   c                 8    |                      t          d          S )u  
    Strip Shadda from a text and return a result text.

    Example:
        >>> text = u"الشّمسيّة"
        >>> strip_shadda(text)
         الشمسية

    @param text: arabic text.
    @type text: unicode.
    @return: return a striped text.
    @rtype: unicode.
    r   )r   rE   r   s    r;   strip_shaddar   *  s     <<###r=   c                 N    | s| S t           D ]}|                     |d          } | S )u  Strip arabic diacritics from a text
    The striped marks are :
        - Small Alef الألف الخنجرية, .
        - Harakat + Shadda
        - Quranic marks
        - Extended arabic diacritics
    Example:
        >>> text = u"الرحمنٰ"
        >>> strip_small(text)
        الرحمن

    @param text: arabic text.
    @type text: unicode.
    @return: return a striped text.
    @rtype: unicode.
    r   )
DIACRITICSr   r   s     r;   strip_diacriticsr   :  s:    "   & &||D"%%Kr=   c                 \    | r)t                               t          t          |           S | S )ud  Normalize Lam Alef ligatures into two letters (LAM and ALEF),
    and Tand return a result text.
    Some systems present lamAlef ligature as a single letter,
    this function convert it into two letters,
    The converted letters into  LAM and ALEF are :
        - LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE

    Example:
        >>> text = u"لانها لالء الاسلام"
        >>> normalize_ligature(text)
        لانها لالئ الاسلام

    @param text: arabic text.
    @type text: unicode.
    @return: return a converted text.
    @rtype: unicode.
    )LIGUATURES_PATTERNr   LAMALEFr   s    r;   normalize_ligaturer   Q  s1    $  C!%%dd&;TBBBKr=   uniformc                    |dk    s|dk    r|                      t          t                    } |                      t          t                    } |                      t          t                    } |                      t
          t                    } |                      t          t                    } |                      t          t                    } |                      t          t                    } n|                     t                    r{t          |           dk    rN| d         t          vr?| d         t          k    st          |           dk    rt          t          z   | dd         z   } nt          t          z   | dd         z   } |                      t          t          t          z             } t                               t          |           } | S )u  Standardize the Hamzat into one form of hamza,
    replace Madda by hamza and alef.
    Replace the LamAlefs by simplified letters.


    Example:
        >>> import pyarabic.araby as araby
        >>> text1 = u"جاء سؤال الأئمة عن الإسلام آجلا"
        >>> araby.normalize_hamza(text1)
        'جاء سءال الءءمة عن الءسلام ءءجلا'
        >>> araby.normalize_hamza(text1, method="tasheel")
            'جاء سوال الايمة عن الاسلام ا
    @param word: arabic text.
    @type word: unicode.
    @param method: how to convert hamzat (uniform, tasheel).
    @type method: unicode.
    @return: return a converted text.
    @rtype: unicode.
    tasheelu
   تسهيلr   r   r   N)r   
ALEF_MADDAr   ALEF_HAMZA_ABOVEALEF_HAMZA_BELOWHAMZA_ABOVEHAMZA_BELOWr   WAWr   YEH
startswithr   rQ   rE   HAMZAHAMZAT_PATTERNr   )r   methods     r;   normalize_hamzar   h  sO   ( f55||J--||,d33||,d33||K..||K..||Is++||Is++??:&& 	04yyA~~47'#9#9!W&&#d))q..t|d122h.u}tABBx/||J66!!%..Kr=   c                 t    t          j        dd                    t                    z   dz   t          |           S )u   
    converts TEH_MARBUTA to HEH
    Example:
        >>> text = 'محبة'
        >>> normalize_teh(text)
        'محبه'

    r1   r   r2   )r   r   joinr   HEHr   s    r;   normalize_tehr     s5     6$+...5t  r=   c                     |                      t          t          z   t                    } |                      t          t          z   t                    } t          j        t
          t          |           S )zP
    converts all alefs to ALEF_MAMDODA with the exception of Alef maksura

    )r   
SMALL_ALEFr   r   r   ALEFAT_PATTERNr   r   s    r;   normalize_alefr     sH     <<
</>>D<<Z/>>D6.$---r=   Fc                 |   t          j        |           }|j                                         t          j                    }t          j                    }t          }|                                }||v r|                                }||v |dk    r||v r*|                                 |                    |           n|t          k    rc|                                 |                    t                     |                    t                     |                    t                     n/|                    t                     |                    |           |                                }|dk    |rd
                    |j                  }t          j        dt          z  t          |          }t          j        t          t          t          |          }t          |          }|d
                    |j                  |fS d
                    |j                  d
                    |j                  fS )u*  
    separate the letters from the vowels, in arabic word,
    if a letter hasn't a haraka, the not definited haraka is attributed.
    return (letters, vowels)

    Example:
        >>> araby.separate(text)
        (u'العربية',
        u'َََََُْ')
        >>> letters, marks =araby.separate(text)
        >>> print letters.encode('utf8')
        العربية
        >>> print marks.encode('utf8')
        >>> for m in marks:
        ...     print araby.name(m)
        فتحة
        سكون
        فتحة
        فتحة
        فتحة
        فتحة
        ضمة

    @param word: the input word
    @type word: unicode
    @param extract_shadda: extract shadda as seperate text
    @type extract_shadda: Boolean
    @return: (letters, vowels)
    @rtype:couple of unicode
    Nr   z[^%s])r	   StackitemsreverserQ   poppushrE   r@   NOT_DEF_HARAKAr   r   r   rH   r   )	r   extract_shaddastack1lettersmarksvowelslast1wordlettersshaddaplacess	            r;   separater     s   > [F
LkmmGKMMEFJJLLE 6//

 6//
4--F?? IIKKKJJuf__ IIKKKJJuJJ~&&&LL    JJ~&&&LL

# 4--$  @hhw}--vh/+FFv&&96*, , #;//SXXek22LAA''%+)>)>??r=   c                    t          |           t          |          k    rdS t          j        |           }|j                                         t          j        |          }|j                                         t          j                    }|                                }|                                }t          }|dk    r|dk    r|t          k    rc|                                }||vr|                    |           |                    |           |t          k    r|                    |           n5|                    |           |t          k    r|                    |           |                                }|                                }|dk    r|dk    |
                                r|
                                sdS d                    |j                  S )u   joint the letters with the marks
    the length ot letters and marks must be equal
    return word

    Example:
        >>> letters = u"العربية"
        >>> marks = u'َََََُْ'
        >>> word = araby.joint(letters, marks)
        >>> print word.encode('utf8')
        اَلْعَرَبَيَةُ

    @param letters: the word letters
    @type letters: unicode
    @param marks: the word marks
    @type marks: unicode
    @return: word
    @rtype: unicode
    r   NF)r   r	   r   r   r   r   rQ   rE   r   r   is_emptyr   )	r   r   stack_letter
stack_mark
word_stacklast_letter	last_markr   tops	            r;   jointr     s   ( 7||s5zz!!r;w''L   U##JJ""$$K  IF


)t"3"3&  ..""C&  $$$OOK(((N**	***OOK(((N**	***"&&((NN$$	 

)t"3"3  !!## )
(;(;(=(= )uwwz'(((r=   c                 2    t          | |          dk     rdS dS )u  
    if the two words has the same letters and the same harakats, this fuction return True.
    The two words can be full vocalized, or partial vocalized

    Example:
        >>> word1 = u"ضَربٌ"
        >>> word2 = u"ضَرْبٌ"
        >>> araby.vocalizedlike(word1, word2)
        True

    @param word1: first word
    @type word1: unicode
    @param word2: second word
    @type word2: unicode
    @return: if two words have similar vocalization
    @rtype: Boolean
    r   FT)vocalized_similarity)word1word2s     r;   vocalizedliker  -  s#    $ E5))A--utr=   c                    t          j        |           }t          j        |          }t          j                    }|                                }|                                }t          }|dk    r|dk    r||k    r?|t          t
          t          fvr)|                                }|                                }n||vrT|t          t
          t          fv r>|                    |           |                                }|                                }n;||v r||vr|                                }n||vr||v r|                                }nn|dk    r|dk    |j        	                                 |
                                r|
                                sdS |dk    s|dk    rdS |rd                    |j                  S dS )u}  If the  word1 is like a wazn (pattern), and can return root
    the letters must be equal,
    the wazn has FEH, AIN, LAM letters.
    this are as generic letters.
    The two words can be full vocalized, or partial vocalized

    Example:
        >>> word1 = u"ضارب"
        >>> wazn = u"فَاعِل"
        >>> araby.waznlike(word1, wazn)
        True

    @param word1: input word
    @type word1: unicode
    @param wazn: given word template  وزن
    @type wazn: unicode
    @param extract_root: return root if True
    @type extract_root: boolean
    @return: if two words have similar vocalization
    @rtype: Boolean
    NFr   T)r	   r   r   rQ   FEHAINr   r   r   r   r   r   )	r  waznextract_rootr   stack2rootr   last2r   s	            r;   waznliker  H  s   , [F[F;==DJJLLEJJLLEF
4--ETMME>>eCc?::JJLLEJJLLEE&  UsCo%=%=IIeJJLLEJJLLEEf__f!4!4JJLLEE&  Uf__JJLLEE 4--ETMM  	JOO 	&//"3"3 	u	$%4--u 	774:&&&4r=   c                    t          |           sdS t          |          st          |           rdS t          |           } t          |          }t          j        |           }t          j        |          }|                                }|                                }|dk    r|dk    r~||k    r)|                                }|                                }nC|t
          k    r|t
          k    rn8|t
          k    r |t
          k    r|                                }nn|dk    r|dk    ~|                                r|                                sdS dS )u8  
    If the two words has the same letters and the same harakats, this fuction return True.
    The first word is partially vocalized, the second is fully
    if the partially contians a shadda, it must be at the same place in the fully

    Example:
        >>> word1 = u"ردّ"
        >>> word2=u"ردَّ"
        >>> araby.shaddalike(word1, word2)
        True

    @param partial: the partially vocalized word
    @type partial: unicode
    @param fully: the fully vocalized word
    @type fully: unicode
    @return: if contains shadda
    @rtype: Boolean
    TFN)r   r   r	   r   r   rE   r   )partialfullypstackvstackplastvlasts         r;   
shaddaliker    sB   ( g t :g#6#6 u G$$G%  E[!!F[FJJLLEJJLLE
4--ETMME>>JJLLEJJLLEEf__&f__&JJLLEE  4--ETMM OO &//"3"3 utr=   c           
      B   dt           dt          dt          dt          d	t          dt           dt
          dt          dt          dt          ddt           dt          dt          dt          dt
          g}| }|D ]}t          j	        |d	|          }|S )
ur  Reduce the Tashkeel, by deleting evident cases.

    Exmaple:
        >>> word = u"يُتَسََلَّمْنَ"
        >>> reduced = araby.reduce_tashkeel(word)
        >>> print reduced.encode('utf8')
        يُتسلّمن

    @param text: the input text fully vocalized.
    @type text: unicode.
    @return : partially vocalized text.
    @rtype: unicode.

    z(?<!(|z))()z(?=z(?<=\s(z))z(?<=r   )
r   r   r@   r   r   r   r   r   r   r   )r   patternsreducedpats       r;   reduce_tashkeelr    s     " #&##sssEEE5559 uuccc" uuccc" uuddd##  #ssCCC// )((%%0%
H( G + +&b'**Nr=   c                    t          j        |           }t          j        |          }|                                }|                                }d}t          }|dk    r|dk    r||k    r)|                                }|                                }n||v r||vr|                                }n||vr||v r|                                }nm|t          k    r|                                }nM|t          k    r|                                }n-|                                }|                                }|dz  }|dk    r|dk    |dk    r| S dS )u      if the two words has the same letters and the same harakats, this function return True.
    The two words can be full vocalized, or partial vocalized

    Example:
        >>> word1 = u"ضَربٌ"
        >>> word2 = u"ضَرْبٌ"
        >>> araby.vocalizedlike(word1, word2)
        True
        >>> word1 = u"ضَربٌ"
        >>> word2 = u"ضَرْبٍ"
        >>> araby.vocalized_similarity(word1, word2)
        -1

    @param word1: first word
    @type word1: unicode
    @param word2: second word
    @type word2: unicode
    @return: return if words are similar, else return negative number of errors
    @rtype: Boolean / int
    r   Nr   T)r	   r   r   rQ   rE   )r  r  r   r	  r   r  	err_countr   s           r;   r   r     s@   * [F[FJJLLEJJLLEIF
4--ETMME>>JJLLEJJLLEEf__f!4!4JJLLEE&  Uf__JJLLEE 

&





Q	# 4--ETMM$ 1}}ztr=   c                 r    t          j        dd| t           j                  } t          j        d|           }|S )u  
    Tokenize text into sentences.

    Example:
        >>> text = u"العربية لغة جميلة. والبلاد بعيدة، والشوق زائد"
        >>> tokens = araby.sentence_tokenize(text)
        >>> print(tokens)
        ‎‎['العربية لغة جميلة.', 'والبلاد بعيدة،', 'والشوق زائد']

    @param text: the input text.
    @type text: unicode.
    @return: list of sentences.
    @rtype: list.
    u   ([.,:;،؟?
])+([
	 ])+z	\1<SPLIT>z<SPLIT>)r   r   UNICODEsplit)r   	sentencess     r;   sentence_tokenizer"  0  s3     61,bjQQDD))Ir=   c                    | rt                    t          urgt                    t          urgt                              |           }d |D             }rfd|D             }rfdfd|D             }|S g S )uj  
    Tokenize text into words.

    Example:
        >>> text = u"العربية لغة جميلة."
        >>> tokens = araby.tokenize(text)
        >>> print u"\n".join(tokens)
        ‎العربية
        ‎لغة
        ‎جميلة
        .
        
    Example 2 (To remove tashkeel and filter out non-Arabic words:):
        >>> text = u"ِاسمٌ الكلبِ في اللغةِ الإنجليزية Dog واسمُ الحمارِ Donky"
        >>> tokenize(text, conditions=is_arabicrange, morphs=strip_tashkeel)
        ['اسم', 'الكلب', 'في', 'اللغة', 'الإنجليزية', 'واسم', 'الحمار']
        
    Example 3 (This structure will enable us to create functions on the fly and pass them:):
        >>> text = u"طلع البدر علينا من ثنيات الوداع"
        >>>tokenize(text, conditions=lambda x: x.startswith(u'ال'))
        ['البدر', 'الوداع']
    
    @param text: the input text.
    @type text: unicode.
    @param conditions: a list of conditions to be applied on tokens, like avoiding non arabic letters.
    @type conditions: one or list of conditions .
    @param morphs: a list of morphological change functions to be applied on tokens, like striping tashkeel or normalizing tokens.
    @type morphs: one or list of morphological functions .
    @return: list of words.
    @rtype: list.
    c                 z    g | ]8}t                               d |          t                               d |          9S r   )TOKEN_REPLACEr   )r9   toks     r;   r<   ztokenize.<locals>.<listcomp>j  s?    ]]]-BSBSTVX[B\B\]-##B,,]]]r=   c                 L    g | ]t          fd D                        S )c                 &    g | ]} |          S r   r   )r9   condr'  s     r;   r<   z'tokenize.<locals>.<listcomp>.<listcomp>m  s!    3U3U3U$DDII3U3U3Ur=   )all)r9   r'  
conditionss    @r;   r<   ztokenize.<locals>.<listcomp>m  s<    WWWcs3U3U3U3U*3U3U3U/V/VWcWWWr=   c                 (    D ]} ||           } | S )Nr   )r'  mmorphss     r;   morphztokenize.<locals>.morpho  s&     ! !A!C&&CC
r=   c                 &    g | ]} |          S r   r   )r9   r'  r0  s     r;   r<   ztokenize.<locals>.<listcomp>t  s!    333SeeCjj333r=   )typelistTOKEN_PATTERNr   )r   r,  r/  tokensr0  s    `` @r;   tokenizer6  C  s    @   
4''zl<<t##vhV$$T**]]]]] 	XWWWWVWWWF 	4    
 4333F333F	r=   r   returnc                    g }t                               |           D ]k}|                    | |                                |                                         |                                |                                d           l|S )u1  
    Tokenize text into words with their positions.

    Example:
        >>> text = "حدثنا ابن أبي عامر، قال: رايت مناما"
        >>> tokens = araby.tokenize_with_location(text)
        >>> print u"\n".join(tokens)
         [{'token': 'حدثنا', 'start': 0,  'end': 5},
          {'token': 'ابن',   'start': 6,  'end': 9}, 
          {'token': 'أبي',   'start': 10, 'end': 13}, 
          {'token': 'عامر',  'start': 14, 'end': 18}, 
          {'token': 'قال',   'start': 20, 'end': 23}, 
          {'token': 'رايت',  'start': 25, 'end': 29},
           {'token': 'مناما','start': 30, 'end': 35}
           ]
        
   
    @param text: the input text.
    @type text: unicode.
    @return: list of dict of (tokens, starts, ends).
    @rtype: list of dict.
    )tokenstartend)TOKEN_PATTERN_SPLITfinditerr   r:  r;  )r   r5  r   s      r;   tokenize_with_locationr>  y  s    . F$--d33  %++--45[[]]99;;
 
 	 	 	 	 Mr=   c                 b    t                               d |           } |                                 S )z
    c                 z    d                     |                     d                              dd                    S )Nz{} r    r   )formatgroupr   )r:   s    r;   <lambda>zfix_spaces.<locals>.<lambda>  s,    QWWQZZ5G5GR5P5P(Q(Q r=   )FIX_SPACES_PATr   stripr   s    r;   
fix_spacesrG    s,     QQSWXXD::<<r=   c           	      h   t          j        dt          z  d| t           j                  } t          j        dt          z  d| t           j                  } t          j        t          t
          z   t
          t          z   | t           j                  } t          j        dt          t          t          dt          dd| t           j                  } t          j        dt          dt          d	d| t           j                  } t          j        dt          dt          dd| t           j                  } | S )
z
    Correct most common errors on word 
    like repetetion of harakats,or tanwin befor alef
    @param text: input text
    @type text: unicode
    @return: corrected text
    @rtype: unicode
    z(?<=[\s\d])([%s])+r   z^([%s])+z(?<=[z])([z])+z([z])+(?=[r   )r   r   TASHKEEL_STRINGr  r   FATHATANr   r   r@   HARAKAT_STRINGrE   r   s    r;   autocorrectrL    s     6'9"TBJOOD6+/4"*EED6$x-D
CCD 6644{{{EEERSUX\^`^hiiD 66nnnfff=brzRRD 66~~~~~~Fr$QSQ[\\D Kr=   arc                     g }|dk    r.| D ]*}|                     t          j        ||                     +n(| D ]%}|                     t          ||                     &d                    |          S )z.
    write the word in full letter' names
    unicodez, )r   r6   r}   r   )r   langnamescs       r;   spellitrS    s    
 Ey 	1 	1ALL)!Q//0000	1  	% 	%ALLa$$$$::er=   zlike: u   مُتَوَهِّمًاu   متوهمًاzsim: u   ثمّu
   ثُمَّu   ثمu"   العربية: لغة جميلة.)u   العربية:u   لغةu
   جميلة.z use tokenizer%  )r   )F)rM  )__doc__
__future__r   r   r   r   r   r6   r8   r   r   __name__r	   r   pyarabic.stackpyarabicCOMMA	SEMICOLONQUESTIONr   r   r   r   r   r   r   BEHr   TEHTHEHJEEMHAHKHAHDALTHALREHZAINSEENSHEENSADDADTAHZAHr  GHAINrH   r  QAFKAFr   MEEMNOONr   r   r   r   MADDA_ABOVEr   r   ZEROONETWOTHREEFOURFIVESIXSEVENEIGHTNINEZERO_WONE_WTWO_WTHREE_WFOUR_WFIVE_WSIX_WSEVEN_WEIGHT_WNINE_WZERO_PONE_PTWO_PTHREE_PFOUR_PFIVE_PSIX_PSEVEN_PEIGHT_PNINE_PPERCENTDECIMAL	THOUSANDSSTAR	MINI_ALEF
ALEF_WASLA	FULL_STOPBYTE_ORDER_MARKrJ  DAMMATANKASRATANr   r   r   rE   r@   r   	SMALL_WAW	SMALL_YEHr   r   LAM_ALEF_HAMZA_BELOWr   SIMPLE_LAM_ALEFSIMPLE_LAM_ALEF_HAMZA_ABOVESIMPLE_LAM_ALEF_HAMZA_BELOWSIMPLE_LAM_ALEF_MADDA_ABOVEr   LETTERSNUMBERS_EASTNUMBERS_WESTNUMBERS_PERSrN   rQ   rT   rK   r   rW   rZ   r]   rm   r`   rd   rg   rj   rq   rt   rw   r{   HAMZAT_STRINGrK  rI  compiler  r   r   SHORTHARAKAT_PATTERNTASHKEEL_PATTERNr   r   r   r4  r<  r&  r   r   rE  r   r   rC   rF   rI   rL   rO   rR   rU   rX   r[   r^   ra   re   rh   rk   rn   rr   ru   ry   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r"  r6  strr3  r>  rG  rL  rS  printtext1wordlistwlr   r=   r;   <module>r     sK   (            
			    
FF   FFFzLLLL     		
 	 	
	
		

		
		

	

		
	 	 
		      !- - - 
(( PD P# Ps PK P Pt PS P$ PPP"P$(P*.P05P7:P<?PADPFIPKNPPUPWZPPP!P#'P)-P/2P47P9EPGJPLQPS]P %P '0P 2BP DMP Q Q
 c3tTUE4)ugvvw1ugvvw1 h%6Xxue
DueU+
Hh	',.B"%
 I{K
,
/ 
,.>
lJ
1 	c3%	<
3	9
%
	Y	*z+-=tT3c5#sCsCN D#tS$eS##sD#!!!(!-116:A!!!1&)2/3R9=r 
2 B R "%b  +.r 47 
2	 B	 R	 "%b	 +.r	 48	
 	"
 2
 B
 !$R
 */
 5? b $R *:2  r  .	).	.	 .	 '	.	
 	).	 	).	 .	 	).	 .	 	).	 .	 	).	 	).	 
9.	 .	  !.	" #.	 .	$ %.	& '.	( 
9).	* +.	, -.	. /.	0 1.	2 	)3.	4 	)5.	6 7.	8 9.	: ;.	< 
;=.	> ]?.	@ &A.	B (C.	D 3E.	 .	F ,3,omo	;	9	;
I	;[.	 .	^   '""((8$$ "*TCHHW$5$55<bjII BJJ 1 1 1 1388F3C3C3C3CDbjQQ  "rz$,)?)?"?$"F"$*. .  2:dSXXh%7%77$>
KK D388F#3#33d:BJGGD388F#3#33d:BJGGRZsxx
';'; ;d BBJOO 
:BJGG bj!?LL 
+,, 

&h(<(<>R>R>RTUWU_a a rz@"*N N GTTbbvv!6!6bbb
p  
  
  
  
    
" " "
     	 	 	                	+ 	+ 	+
+ 
+ 
+ 
+  *	 	 	  $2 2 2  	 	 	  >    	 	 	    8  .  2  .% % %$$ $ $   .  .) ) ) )X
 
 
. . .I@ I@ I@ I@X2) 2) 2)j  67 7 7 7t2 2 2j& & &R0 0 0`  & B 4 4 4 4l     B    6    z 
E(MM"=?PQQRRR	E'''	=AABBB	E(MM)];;<<<	E'''??@@@	E(MM'=99:::	E'''(CEVWWXXX	E'''(CEVWWXXX0EHHHH	%B	E/	E"IIIII+ s    %%