← Back to team overview

team4alfanous team mailing list archive

[Branch ~team4alfanous/alfanous/alfanous-git] Rev 546: Implement the ISO233 romanization

 

------------------------------------------------------------
revno: 546
committer: Assem Chelli <assem.ch@xxxxxxxxx>
timestamp: Thu 2013-01-31 12:45:44 +0100
message:
  Implement the ISO233 romanization
removed:
  src/alfanous-labs/Naq7ara/
  src/alfanous-labs/Naq7ara/Arabizi.py
  src/alfanous-labs/Naq7ara/Arabtex.py
  src/alfanous-labs/Naq7ara/Buckwalter.py
  src/alfanous-labs/Naq7ara/README.rst
  src/alfanous-labs/Naq7ara/__init__.py
added:
  src/alfanous/Romanization.py
modified:
  src/README.rst
  src/alfanous/Constants.py
  src/alfanous/Misc.py
  src/alfanous/Outputs.py
  src/alfanous/QueryProcessing.py
  src/alfanous/Searching.py


--
lp:alfanous
https://code.launchpad.net/~team4alfanous/alfanous/alfanous-git

Your team Alfanous team is subscribed to branch lp:alfanous.
To unsubscribe from this branch go to https://code.launchpad.net/~team4alfanous/alfanous/alfanous-git/+edit-subscription
=== modified file 'src/README.rst'
--- src/README.rst	2012-12-22 19:14:53 +0000
+++ src/README.rst	2013-01-31 11:45:44 +0000
@@ -33,6 +33,7 @@
 :Indexing.py: the manager of the indexes reading.
 :Searching.py: the module responsible of the basic search operation.
 :Suggestions.py: the module responsible of suggestions.
+:Romanization.py: the module responsible for romanization systems.
 :Threading.py: the module responsible of multi-processing during the search.
 :Constants.py: a module that contains some constants used in the API.
 :Exceptions.py: a module that contains some exceptions used in the API.

=== removed directory 'src/alfanous-labs/Naq7ara'
=== removed file 'src/alfanous-labs/Naq7ara/Arabizi.py'
=== removed file 'src/alfanous-labs/Naq7ara/Arabtex.py'
=== removed file 'src/alfanous-labs/Naq7ara/Buckwalter.py'
=== removed file 'src/alfanous-labs/Naq7ara/README.rst'
=== removed file 'src/alfanous-labs/Naq7ara/__init__.py'
=== modified file 'src/alfanous/Constants.py'
--- src/alfanous/Constants.py	2013-01-01 23:38:13 +0000
+++ src/alfanous/Constants.py	2013-01-31 11:45:44 +0000
@@ -19,80 +19,6 @@
 
 
 
-'''
-@author: Assem Chelli
-@contact: assem.ch [at] gmail.com
-@license: AGPL
-
-'''
-
-#:buckwalter code
-BUCKWALTER2UNICODE = {"'": u"\u0621", # hamza-on-the-line
-                "|": u"\u0622", # madda
-                ">": u"\u0623", # hamza-on-'alif
-                "&": u"\u0624", # hamza-on-waaw
-                "<": u"\u0625", # hamza-under-'alif
-                "}": u"\u0626", # hamza-on-yaa'
-                "A": u"\u0627", # bare 'alif
-                "b": u"\u0628", # baa'
-                "p": u"\u0629", # taa' marbuuTa
-                "t": u"\u062A", # taa'
-                "v": u"\u062B", # thaa'
-                "j": u"\u062C", # jiim
-                "H": u"\u062D", # Haa'
-                "x": u"\u062E", # khaa'
-                "d": u"\u062F", # daal
-                "*": u"\u0630", # dhaal
-                "r": u"\u0631", # raa'
-                "z": u"\u0632", # zaay
-                "s": u"\u0633", # siin
-                "$": u"\u0634", # shiin
-                "S": u"\u0635", # Saad
-                "D": u"\u0636", # Daad
-                "T": u"\u0637", # Taa'
-                "Z": u"\u0638", # Zaa' (DHaa')
-                "E": u"\u0639", # cayn
-                "g": u"\u063A", # ghayn
-                "_": u"\u0640", # taTwiil
-                "f": u"\u0641", # faa'
-                "q": u"\u0642", # qaaf
-                "k": u"\u0643", # kaaf
-                "l": u"\u0644", # laam
-                "m": u"\u0645", # miim
-                "n": u"\u0646", # nuun
-                "h": u"\u0647", # haa'
-                "w": u"\u0648", # waaw
-                "Y": u"\u0649", # 'alif maqSuura
-                "y": u"\u064A", # yaa'
-                "F": u"\u064B", # fatHatayn
-                "N": u"\u064C", # Dammatayn
-                "K": u"\u064D", # kasratayn
-                "a": u"\u064E", # fatHa
-                "u": u"\u064F", # Damma
-                "i": u"\u0650", # kasra
-                "~": u"\u0651", # shaddah
-                "o": u"\u0652", # sukuun
-                "`": u"\u0670", # dagger 'alif
-                "{": u"\u0671", # waSla
-                #extended here
-                "^": u"\u0653", # Maddah
-                "#": u"\u0654", # HamzaAbove
-
-                ":"  : "\u06DC", # SmallHighSeen
-                "@"  : "\u06DF", # SmallHighRoundedZero
-                "\"" : "\u06E0", # SmallHighUprightRectangularZero
-                "["  : "\u06E2", # SmallHighMeemIsolatedForm
-                ";"  : "\u06E3", # SmallLowSeen
-                ","  : "\u06E5", # SmallWaw
-                "."  : "\u06E6", # SmallYa
-                "!"  : "\u06E8", # SmallHighNoon
-                "-"  : "\u06EA", # EmptyCentreLowStop
-                "+"  : "\u06EB", # EmptyCentreHighStop
-                "%"  : "\u06EC", # RoundedHighStopWithFilledCentre
-                "]"  : "\u06ED"          #
-
-                }
-
 
 
 #:shaping table

=== modified file 'src/alfanous/Misc.py'
--- src/alfanous/Misc.py	2013-01-22 04:39:54 +0000
+++ src/alfanous/Misc.py	2013-01-31 11:45:44 +0000
@@ -6,9 +6,6 @@
 import sys
 import locale
 
-from  alfanous.Constants import BUCKWALTER2UNICODE
-
-
 #translation functions
 import gettext
 gettext.bindtextdomain( "fanous", "./locale" )
@@ -23,33 +20,9 @@
 #get platform
 SYS = sys.platform
 
-
-
-def buck2uni( string, ignore = "" , reverse = False ):
-	""" encode & decode buckwalter transliteration """
-
-	if reverse:
-		mapping = {}
-		for k, v in BUCKWALTER2UNICODE.items():
-			#reverse the mapping buckwalter <-> unicode
-			mapping[v] = k
-	else:
-		mapping = BUCKWALTER2UNICODE
-
-	result = ""
-	for char in string :
-		if mapping.has_key( char ) and char not in ignore:
-			result += mapping[char]
-		else :
-			result += char
-	return result
-
-
-
 FILTER_DOUBLES = filter_doubles = lambda lst:list( set( lst ) )
 LOCATE = lambda source, dist, itm: dist[source.index( itm )] \
 												if itm in source else None
-
 FIND = lambda source, dist, itm: [dist[i] for i in [i for i in range( len( source ) ) if source[i] == itm]]
 
 

=== modified file 'src/alfanous/Outputs.py'
--- src/alfanous/Outputs.py	2013-01-22 04:59:41 +0000
+++ src/alfanous/Outputs.py	2013-01-31 11:45:44 +0000
@@ -22,7 +22,7 @@
 TODO offer some linguistic operations like vocalize
 TODO derive using Quranic Corpus/functions
 FIXME use xranges in domains
-FIXME vocalization_dict[terms[1]], test key existance before use
+FIXME vocalization_dict[terms[1]], test key existence before use
 TODO include suggestions with search results
 TODO +flag language
 FIXME how to select the translation attached to results, what ID? 
@@ -43,7 +43,7 @@
 from alfanous.dynamic_resources.derivations_dyn import derivedict
 from alfanous.TextProcessing import QArabicSymbolsFilter
 from alfanous.Data import *
-from alfanous.Misc import buck2uni
+from alfanous.Romanization import transliterate
 from alfanous.Misc import LOCATE, FIND, FILTER_DOUBLES
 
 
@@ -396,6 +396,8 @@
 		""" return the results of search for any unit """
 		if unit == "aya":
 			search_results = self._search_aya( flags )
+		elif unit == "translation":
+			search_results = self._search_translation( flags )
 		else:
 			search_results = {}
 
@@ -593,7 +595,7 @@
 						derivations = []
 					words_output[ "individual" ][ cpt ] = {
 															 "word":term[1],
-															 "romanization": buck2uni( term[1], ignore = "" , reverse = True ) if romanization == "buckwalter" else None,
+															 "romanization": transliterate( romanization, term[1], ignore = "" , reverse = True ) if romanization in self.DOMAINS["romanization"] else None,
 															 "nb_matches":term[2],
 															 "nb_ayas":term[3],
 															 "nb_vocalizations": len( vocalizations ),#unneeded
@@ -609,7 +611,6 @@
 			annotation_word_query += u" ) "
 			words_output["global"] = {"nb_words":cpt - 1, "nb_matches":matches, "nb_vocalizations": nb_vocalizations_globale}
 		output["words"] = words_output;
-
 		#Magic_loop to built queries of Adjacents,translations and annotations in the same time
 		if prev_aya or next_aya or translation or  annotation_aya:
 			adja_query = trad_query = annotation_aya_query = u"( 0"

=== modified file 'src/alfanous/QueryProcessing.py'
--- src/alfanous/QueryProcessing.py	2013-01-22 04:39:54 +0000
+++ src/alfanous/QueryProcessing.py	2013-01-31 11:45:44 +0000
@@ -76,7 +76,6 @@
 from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_
 
 from alfanous.Misc import LOCATE, FIND, FILTER_DOUBLES
-# from alfanous.Misc import buck2uni
 
 
 FEEDBACK = True

=== added file 'src/alfanous/Romanization.py'
--- src/alfanous/Romanization.py	1970-01-01 00:00:00 +0000
+++ src/alfanous/Romanization.py	2013-01-31 11:45:44 +0000
@@ -0,0 +1,191 @@
+# coding: utf-8
+
+
+##     Copyright (C) 2009-2012 Assem Chelli <assem.ch [at] gmail.com>
+
+##     This program is free software: you can redistribute it and/or modify
+##     it under the terms of the GNU Affero General Public License as published by
+##     the Free Software Foundation, either version 3 of the License, or
+##     (at your option) any later version.
+
+##     This program is distributed in the hope that it will be useful,
+##     but WITHOUT ANY WARRANTY; without even the implied warranty of
+##     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##     GNU Affero General Public License for more details.
+
+##     You should have received a copy of the GNU Affero General Public License
+##     along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+# Buckwalter Romanization  letters mapping
+BUCKWALTER2UNICODE = {"'": u"\u0621", # hamza-on-the-line
+                "|": u"\u0622", # madda
+                ">": u"\u0623", # hamza-on-'alif
+                "&": u"\u0624", # hamza-on-waaw
+                "<": u"\u0625", # hamza-under-'alif
+                "}": u"\u0626", # hamza-on-yaa'
+                "A": u"\u0627", # bare 'alif
+                "b": u"\u0628", # baa'
+                "p": u"\u0629", # taa' marbuuTa
+                "t": u"\u062A", # taa'
+                "v": u"\u062B", # thaa'
+                "j": u"\u062C", # jiim
+                "H": u"\u062D", # Haa'
+                "x": u"\u062E", # khaa'
+                "d": u"\u062F", # daal
+                "*": u"\u0630", # dhaal
+                "r": u"\u0631", # raa'
+                "z": u"\u0632", # zaay
+                "s": u"\u0633", # siin
+                "$": u"\u0634", # shiin
+                "S": u"\u0635", # Saad
+                "D": u"\u0636", # Daad
+                "T": u"\u0637", # Taa'
+                "Z": u"\u0638", # Zaa' (DHaa')
+                "E": u"\u0639", # cayn
+                "g": u"\u063A", # ghayn
+                "_": u"\u0640", # taTwiil
+                "f": u"\u0641", # faa'
+                "q": u"\u0642", # qaaf
+                "k": u"\u0643", # kaaf
+                "l": u"\u0644", # laam
+                "m": u"\u0645", # miim
+                "n": u"\u0646", # nuun
+                "h": u"\u0647", # haa'
+                "w": u"\u0648", # waaw
+                "Y": u"\u0649", # 'alif maqSuura
+                "y": u"\u064A", # yaa'
+                "F": u"\u064B", # fatHatayn
+                "N": u"\u064C", # Dammatayn
+                "K": u"\u064D", # kasratayn
+                "a": u"\u064E", # fatHa
+                "u": u"\u064F", # Damma
+                "i": u"\u0650", # kasra
+                "~": u"\u0651", # shaddah
+                "o": u"\u0652", # sukuun
+                "`": u"\u0670", # dagger 'alif
+                "{": u"\u0671", # waSla
+                #extended here
+                "^": u"\u0653", # Maddah
+                "#": u"\u0654", # HamzaAbove
+
+                ":"  : "\u06DC", # SmallHighSeen
+                "@"  : "\u06DF", # SmallHighRoundedZero
+                "\"" : "\u06E0", # SmallHighUprightRectangularZero
+                "["  : "\u06E2", # SmallHighMeemIsolatedForm
+                ";"  : "\u06E3", # SmallLowSeen
+                ","  : "\u06E5", # SmallWaw
+                "."  : "\u06E6", # SmallYa
+                "!"  : "\u06E8", # SmallHighNoon
+                "-"  : "\u06EA", # EmptyCentreLowStop
+                "+"  : "\u06EB", # EmptyCentreHighStop
+                "%"  : "\u06EC", # RoundedHighStopWithFilledCentre
+                "]"  : "\u06ED"          #
+
+                }
+
+# ISO233-2 romanization letter mapping
+ISO2UNICODE = { "ˌ": u"\u0621", # hamza-on-the-line
+                #"|": u"\u0622", # madda
+                "ˈ": u"\u0623", # hamza-on-'alif
+                "ˈ": u"\u0624", # hamza-on-waaw
+                #"<": u"\u0625", # hamza-under-'alif
+                "ˈ": u"\u0626", # hamza-on-yaa'
+                "ʾ": u"\u0627", # bare 'alif
+                "b": u"\u0628", # baa'
+                "ẗ": u"\u0629", # taa' marbuuTa
+                "t": u"\u062A", # taa'
+                "ṯ": u"\u062B", # thaa'
+                "ǧ": u"\u062C", # jiim
+                "ḥ": u"\u062D", # Haa'
+                "ẖ": u"\u062E", # khaa'
+                "d": u"\u062F", # daal
+                "ḏ": u"\u0630", # dhaal
+                "r": u"\u0631", # raa'
+                "z": u"\u0632", # zaay
+                "s": u"\u0633", # siin
+                "š": u"\u0634", # shiin
+                "ṣ": u"\u0635", # Saad
+                "ḍ": u"\u0636", # Daad
+                "ṭ": u"\u0637", # Taa'
+                "ẓ": u"\u0638", # Zaa' (DHaa')
+                "ʿ": u"\u0639", # cayn
+                "ġ": u"\u063A", # ghayn
+                #"_": u"\u0640", # taTwiil
+                "f": u"\u0641", # faa'
+                "q": u"\u0642", # qaaf
+                "k": u"\u0643", # kaaf
+                "l": u"\u0644", # laam
+                "m": u"\u0645", # miim
+                "n": u"\u0646", # nuun
+                "h": u"\u0647", # haa'
+                "w": u"\u0648", # waaw
+                "ỳ": u"\u0649", # 'alif maqSuura
+                "y": u"\u064A", # yaa'
+                "á": u"\u064B", # fatHatayn
+                "ú": u"\u064C", # Dammatayn
+                "í": u"\u064D", # kasratayn
+                "a": u"\u064E", # fatHa
+                "u": u"\u064F", # Damma
+                "i": u"\u0650", # kasra
+                #"~": u"\u0651", # shaddah
+                "°": u"\u0652", # sukuun
+                #"`": u"\u0670", # dagger 'alif
+                #"{": u"\u0671", # waSla
+                ##extended here
+                #"^": u"\u0653", # Maddah
+                #"#": u"\u0654", # HamzaAbove
+
+                #":"  : "\u06DC", # SmallHighSeen
+                #"@"  : "\u06DF", # SmallHighRoundedZero
+                #"\"" : "\u06E0", # SmallHighUprightRectangularZero
+                #"["  : "\u06E2", # SmallHighMeemIsolatedForm
+                #";"  : "\u06E3", # SmallLowSeen
+                #","  : "\u06E5", # SmallWaw
+                #"."  : "\u06E6", # SmallYa
+                #"!"  : "\u06E8", # SmallHighNoon
+                #"-"  : "\u06EA", # EmptyCentreLowStop
+                #"+"  : "\u06EB", # EmptyCentreHighStop
+                #"%"  : "\u06EC", # RoundedHighStopWithFilledCentre
+                #"]"  : "\u06ED"          #
+
+                }
+
+# Available romanization systems
+ROMANIZATION_SYSTEMS_MAPPINGS = {
+					"buckwalter":BUCKWALTER2UNICODE,
+					"iso": ISO2UNICODE,
+					"arabtex": None,
+					}
+
+
+
+def guess_romanization_system():
+	""" @todo """
+	pass
+
+
+def transliterate( mode, string, ignore = "" , reverse = False ):
+	""" encode & decode different  romanization systems """
+
+	if ROMANIZATION_SYSTEMS_MAPPINGS.has_key( mode ):
+		MAPPING = ROMANIZATION_SYSTEMS_MAPPINGS[mode]
+	else:
+		MAPPING = {}
+
+	if reverse:
+		mapping = {}
+		for k, v in MAPPING.items():
+			#reverse the selected mapping: romanization <-> unicode
+			mapping[v] = k
+	else:
+		mapping = MAPPING
+
+	result = ""
+	for char in string :
+		if mapping.has_key( char ) and char not in ignore:
+			result += mapping[char]
+		else :
+			result += char
+	return result
+

=== modified file 'src/alfanous/Searching.py'
--- src/alfanous/Searching.py	2013-01-04 20:03:59 +0000
+++ src/alfanous/Searching.py	2013-01-31 11:45:44 +0000
@@ -20,11 +20,10 @@
 '''
 @author: assem
 '''
-
 from alfanous.Indexing import QseDocIndex
 from alfanous.QueryProcessing import QuranicParser
 from alfanous.ResultsProcessing import QSort, QScore
-from alfanous.Misc import buck2uni
+from alfanous.Romanization import transliterate
 
 
 
@@ -119,7 +118,7 @@
 
     def search( self, querystr, limit = 6236, sortedby = "score", reverse = False ):
         if ":" not in querystr:
-            querystr = unicode( buck2uni( querystr, ignore = "'_\"%*?#~[]{}:>+-|" ) )
+            querystr = unicode( transliterate( "buckwalter", querystr, ignore = "'_\"%*?#~[]{}:>+-|" ) )
 
         query = self._qparser.parse( querystr )
         results = self.searcher.search( query, limit, QSort( sortedby ), reverse )