team4alfanous team mailing list archive
-
team4alfanous team
-
Mailing list archive
-
Message #00357
[Branch ~team4alfanous/alfanous/alfanous-git] Rev 546: Implement the ISO233 romanization
------------------------------------------------------------
revno: 546
committer: Assem Chelli <assem.ch@xxxxxxxxx>
timestamp: Thu 2013-01-31 12:45:44 +0100
message:
Implement the ISO233 romanization
removed:
src/alfanous-labs/Naq7ara/
src/alfanous-labs/Naq7ara/Arabizi.py
src/alfanous-labs/Naq7ara/Arabtex.py
src/alfanous-labs/Naq7ara/Buckwalter.py
src/alfanous-labs/Naq7ara/README.rst
src/alfanous-labs/Naq7ara/__init__.py
added:
src/alfanous/Romanization.py
modified:
src/README.rst
src/alfanous/Constants.py
src/alfanous/Misc.py
src/alfanous/Outputs.py
src/alfanous/QueryProcessing.py
src/alfanous/Searching.py
--
lp:alfanous
https://code.launchpad.net/~team4alfanous/alfanous/alfanous-git
Your team Alfanous team is subscribed to branch lp:alfanous.
To unsubscribe from this branch go to https://code.launchpad.net/~team4alfanous/alfanous/alfanous-git/+edit-subscription
=== modified file 'src/README.rst'
--- src/README.rst 2012-12-22 19:14:53 +0000
+++ src/README.rst 2013-01-31 11:45:44 +0000
@@ -33,6 +33,7 @@
:Indexing.py: the manager of the indexes reading.
:Searching.py: the module responsible of the basic search operation.
:Suggestions.py: the module responsible of suggestions.
+:Romanization.py: the module responsible for romanization systems.
:Threading.py: the module responsible of multi-processing during the search.
:Constants.py: a module that contains some constants used in the API.
:Exceptions.py: a module that contains some exceptions used in the API.
=== removed directory 'src/alfanous-labs/Naq7ara'
=== removed file 'src/alfanous-labs/Naq7ara/Arabizi.py'
=== removed file 'src/alfanous-labs/Naq7ara/Arabtex.py'
=== removed file 'src/alfanous-labs/Naq7ara/Buckwalter.py'
=== removed file 'src/alfanous-labs/Naq7ara/README.rst'
=== removed file 'src/alfanous-labs/Naq7ara/__init__.py'
=== modified file 'src/alfanous/Constants.py'
--- src/alfanous/Constants.py 2013-01-01 23:38:13 +0000
+++ src/alfanous/Constants.py 2013-01-31 11:45:44 +0000
@@ -19,80 +19,6 @@
-'''
-@author: Assem Chelli
-@contact: assem.ch [at] gmail.com
-@license: AGPL
-
-'''
-
-#:buckwalter code
-BUCKWALTER2UNICODE = {"'": u"\u0621", # hamza-on-the-line
- "|": u"\u0622", # madda
- ">": u"\u0623", # hamza-on-'alif
- "&": u"\u0624", # hamza-on-waaw
- "<": u"\u0625", # hamza-under-'alif
- "}": u"\u0626", # hamza-on-yaa'
- "A": u"\u0627", # bare 'alif
- "b": u"\u0628", # baa'
- "p": u"\u0629", # taa' marbuuTa
- "t": u"\u062A", # taa'
- "v": u"\u062B", # thaa'
- "j": u"\u062C", # jiim
- "H": u"\u062D", # Haa'
- "x": u"\u062E", # khaa'
- "d": u"\u062F", # daal
- "*": u"\u0630", # dhaal
- "r": u"\u0631", # raa'
- "z": u"\u0632", # zaay
- "s": u"\u0633", # siin
- "$": u"\u0634", # shiin
- "S": u"\u0635", # Saad
- "D": u"\u0636", # Daad
- "T": u"\u0637", # Taa'
- "Z": u"\u0638", # Zaa' (DHaa')
- "E": u"\u0639", # cayn
- "g": u"\u063A", # ghayn
- "_": u"\u0640", # taTwiil
- "f": u"\u0641", # faa'
- "q": u"\u0642", # qaaf
- "k": u"\u0643", # kaaf
- "l": u"\u0644", # laam
- "m": u"\u0645", # miim
- "n": u"\u0646", # nuun
- "h": u"\u0647", # haa'
- "w": u"\u0648", # waaw
- "Y": u"\u0649", # 'alif maqSuura
- "y": u"\u064A", # yaa'
- "F": u"\u064B", # fatHatayn
- "N": u"\u064C", # Dammatayn
- "K": u"\u064D", # kasratayn
- "a": u"\u064E", # fatHa
- "u": u"\u064F", # Damma
- "i": u"\u0650", # kasra
- "~": u"\u0651", # shaddah
- "o": u"\u0652", # sukuun
- "`": u"\u0670", # dagger 'alif
- "{": u"\u0671", # waSla
- #extended here
- "^": u"\u0653", # Maddah
- "#": u"\u0654", # HamzaAbove
-
- ":" : "\u06DC", # SmallHighSeen
- "@" : "\u06DF", # SmallHighRoundedZero
- "\"" : "\u06E0", # SmallHighUprightRectangularZero
- "[" : "\u06E2", # SmallHighMeemIsolatedForm
- ";" : "\u06E3", # SmallLowSeen
- "," : "\u06E5", # SmallWaw
- "." : "\u06E6", # SmallYa
- "!" : "\u06E8", # SmallHighNoon
- "-" : "\u06EA", # EmptyCentreLowStop
- "+" : "\u06EB", # EmptyCentreHighStop
- "%" : "\u06EC", # RoundedHighStopWithFilledCentre
- "]" : "\u06ED" #
-
- }
-
#:shaping table
=== modified file 'src/alfanous/Misc.py'
--- src/alfanous/Misc.py 2013-01-22 04:39:54 +0000
+++ src/alfanous/Misc.py 2013-01-31 11:45:44 +0000
@@ -6,9 +6,6 @@
import sys
import locale
-from alfanous.Constants import BUCKWALTER2UNICODE
-
-
#translation functions
import gettext
gettext.bindtextdomain( "fanous", "./locale" )
@@ -23,33 +20,9 @@
#get platform
SYS = sys.platform
-
-
-def buck2uni( string, ignore = "" , reverse = False ):
- """ encode & decode buckwalter transliteration """
-
- if reverse:
- mapping = {}
- for k, v in BUCKWALTER2UNICODE.items():
- #reverse the mapping buckwalter <-> unicode
- mapping[v] = k
- else:
- mapping = BUCKWALTER2UNICODE
-
- result = ""
- for char in string :
- if mapping.has_key( char ) and char not in ignore:
- result += mapping[char]
- else :
- result += char
- return result
-
-
-
FILTER_DOUBLES = filter_doubles = lambda lst:list( set( lst ) )
LOCATE = lambda source, dist, itm: dist[source.index( itm )] \
if itm in source else None
-
FIND = lambda source, dist, itm: [dist[i] for i in [i for i in range( len( source ) ) if source[i] == itm]]
=== modified file 'src/alfanous/Outputs.py'
--- src/alfanous/Outputs.py 2013-01-22 04:59:41 +0000
+++ src/alfanous/Outputs.py 2013-01-31 11:45:44 +0000
@@ -22,7 +22,7 @@
TODO offer some linguistic operations like vocalize
TODO derive using Quranic Corpus/functions
FIXME use xranges in domains
-FIXME vocalization_dict[terms[1]], test key existance before use
+FIXME vocalization_dict[terms[1]], test key existence before use
TODO include suggestions with search results
TODO +flag language
FIXME how to select the translation attached to results, what ID?
@@ -43,7 +43,7 @@
from alfanous.dynamic_resources.derivations_dyn import derivedict
from alfanous.TextProcessing import QArabicSymbolsFilter
from alfanous.Data import *
-from alfanous.Misc import buck2uni
+from alfanous.Romanization import transliterate
from alfanous.Misc import LOCATE, FIND, FILTER_DOUBLES
@@ -396,6 +396,8 @@
""" return the results of search for any unit """
if unit == "aya":
search_results = self._search_aya( flags )
+ elif unit == "translation":
+ search_results = self._search_translation( flags )
else:
search_results = {}
@@ -593,7 +595,7 @@
derivations = []
words_output[ "individual" ][ cpt ] = {
"word":term[1],
- "romanization": buck2uni( term[1], ignore = "" , reverse = True ) if romanization == "buckwalter" else None,
+ "romanization": transliterate( romanization, term[1], ignore = "" , reverse = True ) if romanization in self.DOMAINS["romanization"] else None,
"nb_matches":term[2],
"nb_ayas":term[3],
"nb_vocalizations": len( vocalizations ),#unneeded
@@ -609,7 +611,6 @@
annotation_word_query += u" ) "
words_output["global"] = {"nb_words":cpt - 1, "nb_matches":matches, "nb_vocalizations": nb_vocalizations_globale}
output["words"] = words_output;
-
#Magic_loop to built queries of Adjacents,translations and annotations in the same time
if prev_aya or next_aya or translation or annotation_aya:
adja_query = trad_query = annotation_aya_query = u"( 0"
=== modified file 'src/alfanous/QueryProcessing.py'
--- src/alfanous/QueryProcessing.py 2013-01-22 04:39:54 +0000
+++ src/alfanous/QueryProcessing.py 2013-01-31 11:45:44 +0000
@@ -76,7 +76,6 @@
from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_
from alfanous.Misc import LOCATE, FIND, FILTER_DOUBLES
-# from alfanous.Misc import buck2uni
FEEDBACK = True
=== added file 'src/alfanous/Romanization.py'
--- src/alfanous/Romanization.py 1970-01-01 00:00:00 +0000
+++ src/alfanous/Romanization.py 2013-01-31 11:45:44 +0000
@@ -0,0 +1,191 @@
+# coding: utf-8
+
+
+## Copyright (C) 2009-2012 Assem Chelli <assem.ch [at] gmail.com>
+
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU Affero General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU Affero General Public License for more details.
+
+## You should have received a copy of the GNU Affero General Public License
+## along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+# Buckwalter Romanization letters mapping (every value is a single unicode character)
+BUCKWALTER2UNICODE = {"'": u"\u0621", # hamza-on-the-line
+    "|": u"\u0622", # madda
+    ">": u"\u0623", # hamza-on-'alif
+    "&": u"\u0624", # hamza-on-waaw
+    "<": u"\u0625", # hamza-under-'alif
+    "}": u"\u0626", # hamza-on-yaa'
+    "A": u"\u0627", # bare 'alif
+    "b": u"\u0628", # baa'
+    "p": u"\u0629", # taa' marbuuTa
+    "t": u"\u062A", # taa'
+    "v": u"\u062B", # thaa'
+    "j": u"\u062C", # jiim
+    "H": u"\u062D", # Haa'
+    "x": u"\u062E", # khaa'
+    "d": u"\u062F", # daal
+    "*": u"\u0630", # dhaal
+    "r": u"\u0631", # raa'
+    "z": u"\u0632", # zaay
+    "s": u"\u0633", # siin
+    "$": u"\u0634", # shiin
+    "S": u"\u0635", # Saad
+    "D": u"\u0636", # Daad
+    "T": u"\u0637", # Taa'
+    "Z": u"\u0638", # Zaa' (DHaa')
+    "E": u"\u0639", # cayn
+    "g": u"\u063A", # ghayn
+    "_": u"\u0640", # taTwiil
+    "f": u"\u0641", # faa'
+    "q": u"\u0642", # qaaf
+    "k": u"\u0643", # kaaf
+    "l": u"\u0644", # laam
+    "m": u"\u0645", # miim
+    "n": u"\u0646", # nuun
+    "h": u"\u0647", # haa'
+    "w": u"\u0648", # waaw
+    "Y": u"\u0649", # 'alif maqSuura
+    "y": u"\u064A", # yaa'
+    "F": u"\u064B", # fatHatayn
+    "N": u"\u064C", # Dammatayn
+    "K": u"\u064D", # kasratayn
+    "a": u"\u064E", # fatHa
+    "u": u"\u064F", # Damma
+    "i": u"\u0650", # kasra
+    "~": u"\u0651", # shaddah
+    "o": u"\u0652", # sukuun
+    "`": u"\u0670", # dagger 'alif
+    "{": u"\u0671", # waSla
+    #extended here (u"" prefix required: a plain "\u06DC" is six literal characters in Python 2)
+    "^": u"\u0653", # Maddah
+    "#": u"\u0654", # HamzaAbove
+
+    ":" : u"\u06DC", # SmallHighSeen
+    "@" : u"\u06DF", # SmallHighRoundedZero
+    "\"" : u"\u06E0", # SmallHighUprightRectangularZero
+    "[" : u"\u06E2", # SmallHighMeemIsolatedForm
+    ";" : u"\u06E3", # SmallLowSeen
+    "," : u"\u06E5", # SmallWaw
+    "." : u"\u06E6", # SmallYa
+    "!" : u"\u06E8", # SmallHighNoon
+    "-" : u"\u06EA", # EmptyCentreLowStop
+    "+" : u"\u06EB", # EmptyCentreHighStop
+    "%" : u"\u06EC", # RoundedHighStopWithFilledCentre
+    "]" : u"\u06ED" #
+
+    }
+
+# ISO233-2 romanization letter mapping (unicode keys, so that one key is one character)
+ISO2UNICODE = { u"ˌ": u"\u0621", # hamza-on-the-line
+    #"|": u"\u0622", # madda
+    u"ˈ": u"\u0623", # hamza-on-'alif (key shared with the two hamza forms below)
+    u"ˈ": u"\u0624", # hamza-on-waaw (shared key: only the last entry stays in the dict)
+    #"<": u"\u0625", # hamza-under-'alif
+    u"ˈ": u"\u0626", # hamza-on-yaa' (the entry that is kept for this key)
+    u"ʾ": u"\u0627", # bare 'alif
+    u"b": u"\u0628", # baa'
+    u"ẗ": u"\u0629", # taa' marbuuTa
+    u"t": u"\u062A", # taa'
+    u"ṯ": u"\u062B", # thaa'
+    u"ǧ": u"\u062C", # jiim
+    u"ḥ": u"\u062D", # Haa'
+    u"ẖ": u"\u062E", # khaa'
+    u"d": u"\u062F", # daal
+    u"ḏ": u"\u0630", # dhaal
+    u"r": u"\u0631", # raa'
+    u"z": u"\u0632", # zaay
+    u"s": u"\u0633", # siin
+    u"š": u"\u0634", # shiin
+    u"ṣ": u"\u0635", # Saad
+    u"ḍ": u"\u0636", # Daad
+    u"ṭ": u"\u0637", # Taa'
+    u"ẓ": u"\u0638", # Zaa' (DHaa')
+    u"ʿ": u"\u0639", # cayn
+    u"ġ": u"\u063A", # ghayn
+    #"_": u"\u0640", # taTwiil
+    u"f": u"\u0641", # faa'
+    u"q": u"\u0642", # qaaf
+    u"k": u"\u0643", # kaaf
+    u"l": u"\u0644", # laam
+    u"m": u"\u0645", # miim
+    u"n": u"\u0646", # nuun
+    u"h": u"\u0647", # haa'
+    u"w": u"\u0648", # waaw
+    u"ỳ": u"\u0649", # 'alif maqSuura
+    u"y": u"\u064A", # yaa'
+    u"á": u"\u064B", # fatHatayn
+    u"ú": u"\u064C", # Dammatayn
+    u"í": u"\u064D", # kasratayn
+    u"a": u"\u064E", # fatHa
+    u"u": u"\u064F", # Damma
+    u"i": u"\u0650", # kasra
+    #"~": u"\u0651", # shaddah
+    u"°": u"\u0652", # sukuun
+    #"`": u"\u0670", # dagger 'alif
+    #"{": u"\u0671", # waSla
+    ##extended here
+    #"^": u"\u0653", # Maddah
+    #"#": u"\u0654", # HamzaAbove
+
+    #":" : "\u06DC", # SmallHighSeen
+    #"@" : "\u06DF", # SmallHighRoundedZero
+    #"\"" : "\u06E0", # SmallHighUprightRectangularZero
+    #"[" : "\u06E2", # SmallHighMeemIsolatedForm
+    #";" : "\u06E3", # SmallLowSeen
+    #"," : "\u06E5", # SmallWaw
+    #"." : "\u06E6", # SmallYa
+    #"!" : "\u06E8", # SmallHighNoon
+    #"-" : "\u06EA", # EmptyCentreLowStop
+    #"+" : "\u06EB", # EmptyCentreHighStop
+    #"%" : "\u06EC", # RoundedHighStopWithFilledCentre
+    #"]" : "\u06ED" #
+
+    }
+
+# Available romanization systems: system name -> mapping table (None = no table defined here)
+ROMANIZATION_SYSTEMS_MAPPINGS = {
+ "buckwalter":BUCKWALTER2UNICODE,
+ "iso": ISO2UNICODE,
+ "arabtex": None,
+ }
+
+
+
+def guess_romanization_system():
+ """ Placeholder (@todo): nothing is implemented yet, the call returns None """
+ pass
+
+
+def transliterate( mode, string, ignore = "" , reverse = False ):
+    """ Encode & decode different romanization systems.
+
+    @param mode: name of a system in ROMANIZATION_SYSTEMS_MAPPINGS
+    @param string: the text to convert
+    @param ignore: characters that must be left untouched
+    @param reverse: if True, the mapping is inverted (unicode -> romanization)
+    @return: the converted text; it is returned unchanged when the system
+             is unknown or has no mapping table (e.g. "arabtex" is None)
+    """
+    # "or {}" guards against systems registered with a None table
+    mapping = ROMANIZATION_SYSTEMS_MAPPINGS.get( mode ) or {}
+    if reverse:
+        # invert the table: romanization <-> unicode (the last duplicate wins)
+        mapping = dict( ( v, k ) for k, v in mapping.items() )
+
+    result = []
+    for char in string:
+        if char in mapping and char not in ignore:
+            result.append( mapping[char] )
+        else:
+            result.append( char )
+    return "".join( result )
+
=== modified file 'src/alfanous/Searching.py'
--- src/alfanous/Searching.py 2013-01-04 20:03:59 +0000
+++ src/alfanous/Searching.py 2013-01-31 11:45:44 +0000
@@ -20,11 +20,10 @@
'''
@author: assem
'''
-
from alfanous.Indexing import QseDocIndex
from alfanous.QueryProcessing import QuranicParser
from alfanous.ResultsProcessing import QSort, QScore
-from alfanous.Misc import buck2uni
+from alfanous.Romanization import transliterate
@@ -119,7 +118,7 @@
def search( self, querystr, limit = 6236, sortedby = "score", reverse = False ):
if ":" not in querystr:
- querystr = unicode( buck2uni( querystr, ignore = "'_\"%*?#~[]{}:>+-|" ) )
+ querystr = unicode( transliterate( "buckwalter", querystr, ignore = "'_\"%*?#~[]{}:>+-|" ) )
query = self._qparser.parse( querystr )
results = self.searcher.search( query, limit, QSort( sortedby ), reverse )