File: //usr/share/ibus-table/engine/tabsqlitedb.py
# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-table - The Tables engine for IBus
#
# Copyright (c) 2008-2009 Yu Yuwei <[email protected]>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# $Id: $
#
import os
import os.path as path
from sys import stderr
import sqlite3
import tabdict
import uuid
import time
import re
import chinese_variants
patt_r = re.compile(r'c([ea])(\d):(.*)')
patt_p = re.compile(r'p(-{0,1}\d)(-{0,1}\d)')
# First, define the numeric column indexes we will use :)
#(MLEN, CLEN, M0, M1, M2, M3, M4, PHRASE, FREQ, USER_FREQ) = range (0,10)
class tabsqlitedb:
'''Phrase database for tables'''
def __init__(self, name = 'table.db', user_db = None, filename = None ):
    '''Open (or start creating) a table database and attach its helpers.

    name     -- path of an existing system table db; used when `filename`
                is not given.
    user_db  -- file name of the per-user db; it is placed under
                $HOME/.ibus/tables.  When None, an in-memory db is attached
                as "user_db" instead.
    filename -- path of a db that is being created from a table source.
                When given, only the main db is prepared and neither
                "user_db" nor "mudb" is attached.
    '''
    # use filename when you are creating db from source
    # use name when you are using db
    # first we use the Parse in tabdict, which transform the char(a,b,c,...) to int(1,2,3,...) to fasten the sql enquiry
    self.parse = tabdict.parse
    self.deparse = tabdict.deparse
    self._add_phrase_sqlstr = ''
    self.old_phrases=[]
    self.ime_property_cache = {}
    if filename:
        # now we are creating db
        self.db = sqlite3.connect( filename )
    else:
        # NOTE(review): this looks like a read-through of the db file
        # before opening it -- confirm the intent.  `name` is interpolated
        # into a shell command unquoted, and os.system reports failure via
        # its return value rather than by raising, so the try/except is
        # unlikely to ever trigger.
        try:
            os.system('cat %s > /dev/null' % name)
        except:
            pass
        # open system phrase db
        self.db = sqlite3.connect( name )
    try:
        self.db.execute( 'PRAGMA page_size = 8192; ' )
        self.db.execute( 'PRAGMA cache_size = 20000; ' )
        # increase the cache size to speedup sqlite enquiry
        self.db.execute( 'PRAGMA temp_store = MEMORY; ' )
        self.db.execute( 'PRAGMA synchronous = OFF; ' )
    except:
        print 'encountering error when init db'
        pass
    # create IME property table
    self.db.executescript('CREATE TABLE IF NOT EXISTS main.ime (attr TEXT, val TEXT);')
    # make sure we have values in ime table.
    # A db without a "name" row is treated as uninitialised and is seeded
    # with the defaults below.
    if not self.db.execute('SELECT val FROM main.ime \
WHERE attr="name";').fetchall():
        ime_keys={'name':'',
                  'name.zh_cn':'',
                  'name.zh_hk':'',
                  'name.zh_tw':'',
                  'author':'somebody',
                  'uuid':'%s' % uuid.uuid4(),
                  'serial_number':'%s' % time.strftime('%Y%m%d'),
                  'icon':'ibus-table.svg',
                  'license':'LGPL',
                  'languages':'',
                  'language_filter':'',
                  'valid_input_chars':'abcdefghijklmnopqrstuvwxyz',
                  'max_key_length':'4',
        #          'commit_keys':'space',
        #          'forward_keys':'Return',
                  'select_keys':'1,2,3,4,5,6,7,8,9,0',
                  'page_up_keys':'Page_Up,minus',
                  'page_down_keys':'Page_Down,equal',
                  'status_prompt':'',
                  'def_full_width_punct':'TRUE',
                  'def_full_width_letter':'FALSE',
                  'user_can_define_phrase':'FALSE',
                  'pinyin_mode':'FALSE',
                  'dynamic_adjust':'FALSE',
                  'auto_select':'false',
                  'auto_commit':'false',
                  #'no_check_chars':u'',
                  'description':'A IME under IBus Table',
                  'layout':'us',
                  'rules':'',
                  #'rules':'ce2:p11+p12+p21+p22;ce3:p11+p21+p22+p31;ca4:p11+p21+p31+p41'}
                  'least_commit_length':'0',
                  'start_chars':'',
                  'orientation':'1'
                  # we use this entry for those IME, which don't
                  # have rules to build up phrase, but still need
                  # auto commit to preedit
                  }
        # initialise the attributes in the ime table, which should be updated from mabiao
        for _name in ime_keys:
            sqlstr = 'INSERT INTO main.ime (attr,val) VALUES (?,?);'
            self.db.execute( sqlstr, (_name,ime_keys[_name]) )
    # share variables in this class:
    self._mlen = int ( self.get_ime_property ("max_key_length") )
    # for chinese
    self._is_chinese = self.is_chinese()
    # for fast add word
    self._set_add_phrase_sqlstr()
    #(ID, MLEN, CLEN, M0, M1, M2, M3, M4, CATEGORY, PHRASE, FREQ, USER_FREQ) = range (0,12)
    # column names of the phrases table, in table order
    self._pt_index = ['id', 'mlen', 'clen']
    for i in range(self._mlen):
        self._pt_index.append ('m%d' %i)
    if self._is_chinese:
        self._pt_index += ['category']
    self._pt_index += ['phrase','freq','user_freq']
    # The two flags below are read as text from the ime table and turned
    # into booleans; a missing entry is reported and treated as False.
    self.user_can_define_phrase = self.get_ime_property('user_can_define_phrase')
    if self.user_can_define_phrase:
        if self.user_can_define_phrase.lower() == u'true' :
            self.user_can_define_phrase = True
        else:
            self.user_can_define_phrase = False
    else:
        print 'Could not find "user_can_define_phrase" entry from database, is it a outdated database?'
        self.user_can_define_phrase = False
    self.dynamic_adjust = self.get_ime_property('dynamic_adjust')
    if self.dynamic_adjust:
        if self.dynamic_adjust.lower() == u'true' :
            self.dynamic_adjust = True
        else:
            self.dynamic_adjust = False
    else:
        print 'Could not find "dynamic_adjust" entry from database, is it a outdated database?'
        self.dynamic_adjust = False
    self.rules = self.get_rules ()
    self.pkeylens = []
    self.pkeylens = self.phrase_keys_len ()
    self.startchars = self.get_start_chars ()
    #self._no_check_chars = self.get_no_check_chars()
    # for fast gouci
    self._goucima={}
    if filename:
        # since we just creating db, we do not need userdb and mudb
        return
    # user database:
    if user_db != None:
        # NOTE(review): os.getenv returns None when HOME is unset, which
        # would make path.join fail here -- confirm HOME is guaranteed.
        home_path = os.getenv ("HOME")
        tables_path = path.join (home_path, ".ibus", "tables")
        user_db = path.join (tables_path, user_db)
        if not path.isdir (tables_path):
            os.makedirs (tables_path)
        try:
            desc = self.get_database_desc (user_db)
            if desc == None :
                self.init_user_db (user_db)
            elif desc["version"] != "0.5":
                # unsupported version: keep the old file under a new name,
                # remember its phrases and start a fresh user db
                new_name = "%s.%d" %(user_db, os.getpid())
                print >> stderr, "Can not support the user db. We will rename it to %s" % new_name
                self.old_phrases = self.extra_user_phrases( user_db )
                os.rename (user_db, new_name)
                self.init_user_db (user_db)
            elif self.get_table_phrase_len(user_db) != len(self._pt_index):
                # the column count differs from what this table expects
                print >> stderr, "user db format outdated."
                # store old user phrases
                self.old_phrases = self.extra_user_phrases( user_db )
                new_name = "%s.%d" %(user_db, os.getpid())
                os.rename (user_db, new_name)
                self.init_user_db (user_db)
        except:
            import traceback
            traceback.print_exc()
    else:
        user_db = ":memory:"
    # open user phrase database
    # NOTE(review): the path is formatted straight into the SQL text, so a
    # path containing a double quote would break the statement.
    try:
        self.db.execute ('ATTACH DATABASE "%s" AS user_db;' % user_db)
    except:
        print >> stderr, "The user database was damaged. We will recreate it!"
        os.rename (user_db, "%s.%d" % (user_db, os.getpid ()))
        self.init_user_db (user_db)
        self.db.execute ('ATTACH DATABASE "%s" AS user_db;' % user_db)
    self.create_tables ("user_db")
    if self.old_phrases:
        # (mlen, phrase, freq, user_freq)
        # the phrases will be deparse again, and then be added
        # the characters will be discard :(
        #chars = filter (lambda x: x[0] == 1, self.old_phrases)
        # print chars
        phrases = filter (lambda x: x[0] > 1, self.old_phrases)
        phrases = map(lambda x: [self.parse_phrase_to_tabkeys(x[1])]\
                + list(x[1:]) , phrases)
        map (self.u_add_phrase,phrases)
        self.db.commit ()
    # try create all tables in user database
    self.create_indexes ("user_db",commit=False)
    self.generate_userdb_desc ()
    # attach mudb for working process
    mudb = ":memory:"
    self.db.execute ('ATTACH DATABASE "%s" AS mudb;' % mudb )
    self.create_tables ("mudb")
def update_phrase (self, entry, database='user_db'):
'''update phrase freqs'''
#print entry
_con = [ entry[-1] ] + list(entry[1:3+entry[1]]) + [entry[-3]]
#print _con
_condition = u''.join( map(lambda x: 'AND m%d = ? ' % x, range(entry[1]) ) )
#print _condition
sqlstr = 'UPDATE %s.phrases SET user_freq = ? WHERE mlen = ? AND clen = ? %s AND phrase = ?;' % (database, _condition)
#print sqlstr
self.db.execute ( sqlstr , _con )
# because we may update different db, we'd better commit every time.
self.db.commit()
def sync_usrdb (self):
# we need to update the user_db
#print 'sync userdb'
mudata = self.db.execute ('SELECT * FROM mudb.phrases;').fetchall()
#print mudata
data_u = filter ( lambda x: x[-2] in [1,-3], mudata)
data_a = filter ( lambda x: x[-2]==2, mudata)
data_n = filter ( lambda x: x[-2]==-2, mudata)
#print data_a
data_a = map (lambda x: (u''.join ( map(self.deparse, x[3:3+x[1]])),x[-3],0,x[-1] ), data_a)
data_n = map (lambda x: (u''.join ( map(self.deparse, x[3:3+x[1]])),x[-3],-1,x[-1] ), data_n)
#print data_u
map (self.update_phrase, data_u)
#print self.db.execute('select * from user_db.phrases;').fetchall()
map (self.u_add_phrase,data_a)
map (self.u_add_phrase,data_n)
self.db.commit ()
def is_chinese (self):
__lang = self.get_ime_property ('languages')
if __lang:
__langs = __lang.split(',')
for _l in __langs:
if _l.lower().find('zh') != -1:
return True
return False
def get_chinese_mode (self):
try:
__dict = {'cm0':0,'cm1':1,'cm2':2,'cm3':3,'cm4':4}
__filt = self.get_ime_property ('language_filter')
return __dict[__filt]
except:
return -1
def get_page_size (self):
return len(self.get_select_keys().split(','))
def get_select_keys (self):
ret = None
try:
ret = self.get_ime_property("select_keys")
finally:
if ret is None:
ret = "1,2,3,4,5,6,7,8,9,0"
return ret
def get_orientation (self):
try:
return int( self.get_ime_property ('orientation') )
except:
return 1
def create_tables (self, database):
'''Create tables that contain all phrase'''
try:
self.db.execute( 'PRAGMA cache_size = 20000; ' )
# increase the cache size to speedup sqlite enquiry
except:
pass
if database == 'main':
# create ikeys table
sqlstr = 'CREATE TABLE IF NOT EXISTS %s.ikeys (ikey TEXT PRIMARY KEY, id INTEGER);' % database
self.db.execute ( sqlstr )
# create goucima table, this table is used in construct new phrases
sqlstr = 'CREATE TABLE IF NOT EXISTS %s.goucima (zi TEXT PRIMARY KEY' % database
for i in range(self._mlen):
sqlstr += ', g%d INTEGER' % i
#sqlstr += ''.join(map (lambda x: ', g%d INTEGER' % x, range(self._mlen)) )
sqlstr += ');'
self.db.execute ( sqlstr )
# create pinyin table, this table is used in search single character for user handly
sqlstr = 'CREATE TABLE IF NOT EXISTS %s.pinyin ( plen INTEGER, ' % database
#for i in range(6):
# sqlstr += 'p%d INTEGER, ' % i
sqlstr += ''.join( map (lambda x: 'p%d INTEGER, ' % x, range(7) ) )
sqlstr += 'zi TEXT, freq INTEGER);'
self.db.execute ( sqlstr )
# create phrase table (mabiao)
sqlstr = 'CREATE TABLE IF NOT EXISTS %s.phrases (id INTEGER PRIMARY KEY AUTOINCREMENT,\
mlen INTEGER, clen INTEGER, ' % database
#for i in range(self._mlen):
# sqlstr += 'm%d INTEGER, ' % i
sqlstr += ''.join ( map (lambda x: 'm%d INTEGER, ' % x, range(self._mlen)) )
if self._is_chinese:
sqlstr += 'category INTEGER, '
sqlstr += 'phrase TEXT, freq INTEGER, user_freq INTEGER);'
self.db.execute ( sqlstr )
self.db.commit()
def update_ime (self, attrs):
'''Update attributes in ime table, attrs is a iterable object
Like [(attr,val), (attr,val), ...]
'''
sqlstr = 'UPDATE main.ime SET val = ? WHERE attr = ?;'
for attr,val in attrs:
_sqlstr = 'SELECT * from main.ime WHERE attr = ?'
res = self.db.execute( _sqlstr, (attr,) ).fetchall()
if res:
self.db.execute(sqlstr,(val,attr))
else:
#print '"',attr,'"'," didn't in ime property now!"
pass
# then flush previous cache
self.ime_property_cache = {}
# we need to update some self variables now.
self._mlen = int (self.get_ime_property ('max_key_length' ))
self._is_chinese = self.is_chinese()
self._set_add_phrase_sqlstr()
self._pt_index = ['id', 'mlen', 'clen']
for i in range(self._mlen):
self._pt_index.append ('m%d' %i)
if self._is_chinese:
self._pt_index += ['category']
self._pt_index += ['phrase','freq','user_freq']
self.user_can_define_phrase = self.get_ime_property('user_can_define_phrase')
if self.user_can_define_phrase:
if self.user_can_define_phrase.lower() == u'true' :
self.user_can_define_phrase = True
else:
self.user_can_define_phrase = False
else:
print 'Could not find "user_can_define_phrase" entry from database, is it a outdated database?'
self.user_can_define_phrase = False
self.rules = self.get_rules ()
self.db.commit()
def get_rules (self):
'''Get phrase construct rules'''
rules={}
if self.user_can_define_phrase:
try:
_rules = self.get_ime_property ('rules')
if _rules:
_rules = _rules.strip().split(';')
for rule in _rules:
res = patt_r.match (rule)
if res:
cms = []
if res.group(1) == 'a':
rules['above'] = int(res.group(2))
_cms = res.group(3).split('+')
if len(_cms) > self._mlen:
print 'rule: "%s" over max key length' % rule
break
for _cm in _cms:
cm_res = patt_p.match(_cm)
cms.append(( int(cm_res.group(1)),int(cm_res.group(2)) ))
rules[int(res.group(2))]=cms
else:
print 'not a legal rule: "%s"' % rule
except Exception:
import traceback
traceback.print_exc ()
return rules
else:
return ""
def phrase_keys_len (self):
'''Return the phrase possible key length'''
if self.rules:
max_len = self.rules["above"]
try:
return map ( lambda x: len(self.rules[x]), range(2, max_len+1) )[:]
except:
return []
else:
# we try to findout whether we have least commit len
# in ime table
try:
least_commit_len = int(self.get_ime_property('least_commit_length'))
except:
least_commit_len = 0
if least_commit_len > 0:
return range (least_commit_len, self._mlen + 1)
else:
return []
def get_start_chars (self):
'''return possible start chars of IME'''
try:
return self.get_ime_property('start_chars')
except:
return ''
def get_no_check_chars (self):
'''Get the characters which engine should not change freq'''
_chars= self.get_ime_property('no_check_chars')
try:
_chars = _chars.decode('utf-8')
except:
pass
return _chars
def add_phrases (self, phrases, database = 'main'):
'''Add phrases to database, phrases is a iterable object
Like: [(tabkeys, phrase, freq ,user_freq), (tabkeys, phrase, freq, user_freq), ...]
'''
map (self.add_phrase, phrases, [database]*len(phrases),[False]*len(phrases) )
self.db.commit()
def add_new_phrases (self, nphrases, database='main'):
'''Add new phrases into db, new phrases is a object
of [(phrase,freq), (phrase,freq),...]'''
n_phrases=[]
for _ph, _freq in nphrases:
try:
_tabkey = self.parse_phrase_to_tabkeys(_ph)
if not self.check_phrase_internal (_ph, _tabkey, database):
# we don't have this phrase
n_phrases.append ( (_tabkey, _ph, _freq, 0) )
except:
print '\"%s\" would not been added' % _ph
if n_phrases:
self.add_phrases ( n_phrases, database )
def u_add_phrase (self,nphrase):
'''Add a phrase to userdb'''
self.add_phrase (nphrase,database='user_db',commit=False)
def _set_add_phrase_sqlstr(self):
'''Create the sqlstr for add phrase according to self._mlen.'''
sqlstr = 'INSERT INTO %s.phrases ( mlen, clen, '
sql_suffix = 'VALUES ( ?, ?, '
mmlen = range(self._mlen)
sqlstr += ''.join ( map(lambda x: 'm%d, ' %x , mmlen) )
sql_suffix += ''.join ( map (lambda x: '?, ' , mmlen) )
if self._is_chinese:
sqlstr += 'category, '
sql_suffix += '?, '
sqlstr += 'phrase, freq, user_freq) '
sql_suffix += '?, ?, ? );'
sqlstr += sql_suffix
self._add_phrase_sqlstr = sqlstr
def add_phrase (self, aphrase, database = 'main',commit=True):
'''Add phrase to database, phrase is a object of
(tabkeys, phrase, freq ,user_freq)
'''
sqlstr = self._add_phrase_sqlstr
try:
tabkeys,phrase,freq,user_freq = aphrase
except:
tabkeys,phrase,freq = aphrase
user_freq = 0
# now we will set the category bits if this is chinese
if self._is_chinese:
category = chinese_variants.detect_chinese_category(phrase)
try:
tbks = self.parse(tabkeys)
if len(tbks) != len(tabkeys):
print 'In %s %s: we parse tabkeys fail' % (phrase, tabkeys )
return
record = [None] * (5 + self._mlen)
record [0] = len (tabkeys)
record [1] = len (phrase)
record [2: 2+len(tabkeys)] = map (lambda x: tbks[x].get_key_id(), range(0,len(tabkeys)))
if self._is_chinese:
record +=[None]
record[-4] = category
record[-3:] = phrase, freq, user_freq
self.db.execute (sqlstr % database, record)
if commit:
self.db.commit()
except Exception:
import traceback
traceback.print_exc()
def add_goucima (self, gcms):
'''Add goucima into database, gcms is iterable object
Like gcms = [(zi,goucima),(zi,goucima), ...]
'''
count = 1
for zi,gcm in gcms:
_con = ''
_val = ''
_len = min ( len(gcm),self._mlen)
for i in range( _len ):
_con += ', g%d' % i
_val += ', ?'
sqlstr = '''INSERT INTO main.goucima ( zi %s )
VALUES ( ? %s );''' % (_con, _val)
try:
gc = self.parse(gcm)
if len(gc) != len(gcm):
error_m = u'%s %s: Can not parse goucima' % (zi, gcm )
raise Exception ( error_m.encode ('utf8') )
record = [zi]
for i in range(_len):
record.append( gc[i].get_key_id())
self.db.execute (sqlstr , record)
except Exception:
import traceback
traceback.print_exc()
count += 1
self.db.commit()
def add_pinyin (self, pinyins, database = 'main'):
    '''Add pinyin rows to `database` and commit.

    pinyins is an iterable object whose items are unpacked in the order
    (pinyin, zi, freq):
    [(pinyin, zi, freq), (pinyin, zi, freq), ...]
    The tone digits 1-5 of a pinyin are replaced by ! @ # $ % before it
    is parsed.  An entry that cannot be parsed or inserted is reported
    together with its running number and skipped.
    '''
    sqlstr = 'INSERT INTO %s.pinyin ( plen, '
    sql_suffix = 'VALUES ( ?, '
    for i in range(7):
        sqlstr += 'p%d, ' % i
        sql_suffix += '?, '
    sqlstr += 'zi, freq ) '
    sql_suffix += '?, ? );'
    sqlstr += sql_suffix
    # running number of the entry, only used in the error report
    count = 1
    for pinyin,zi,freq in pinyins:
        try:
            pinyin_n = pinyin.replace('1','!').replace('2','@').replace('3','#').replace('4','$').replace('5','%')
            py = self.parse(pinyin_n)
            if len(py) != len(pinyin_n):
                error_m = u'%s %s: Can not parse pinyin' % (zi, pinyin )
                raise Exception ( error_m.encode ('utf8') )
            # plen, p0 .. p6, zi, freq
            record = [None]*10
            record [0] = len (pinyin_n)
            for i in range(0,len(pinyin_n)):
                record [ 1+i ] = py[i].get_key_id()
            record [-2] = zi
            record [-1] = freq
            self.db.execute (sqlstr % database, record)
        except Exception:
            print count, ': ', zi.encode('utf8'), ' ', pinyin
            import traceback
            traceback.print_exc()
        count += 1
    self.db.commit()
def optimize_database (self, database='main'):
sqlstr = '''
CREATE TABLE tmp AS SELECT * FROM %(database)s.phrases;
DELETE FROM %(database)s.phrases;
INSERT INTO %(database)s.phrases SELECT * FROM tmp ORDER BY
%(tabkeystr)s mlen ASC, user_freq DESC, freq DESC, id ASC;
DROP TABLE tmp;
CREATE TABLE tmp AS SELECT * FROM %(database)s.goucima;
DELETE FROM %(database)s.goucima;
INSERT INTO %(database)s.goucima SELECT * FROM tmp ORDER BY zi,g0,g1;
DROP TABLE tmp;
CREATE TABLE tmp AS SELECT * FROM %(database)s.pinyin;
DELETE FROM %(database)s.pinyin;
INSERT INTO %(database)s.pinyin SELECT * FROM tmp ORDER BY p0,p1,p2,p3,p4,p5,plen ASC;
DROP TABLE tmp;
'''
tabkeystr = ''
for i in range(self._mlen):
tabkeystr +='m%d, ' % i
self.db.executescript (sqlstr % {'database':database,'tabkeystr':tabkeystr })
self.db.executescript ("VACUUM;")
self.db.commit()
def drop_indexes(self, database):
'''Drop the index in database to reduce it's size'''
sqlstr = '''
DROP INDEX IF EXISTS %(database)s.goucima_index_z;
DROP INDEX IF EXISTS %(database)s.pinyin_index_i;
DROP INDEX IF EXISTS %(database)s.phrases_index_p;
DROP INDEX IF EXISTS %(database)s.phrases_index_i;
VACUUM;
''' % { 'database':database }
self.db.executescript (sqlstr)
self.db.commit()
def create_indexes(self, database, commit=True):
sqlstr = '''
CREATE INDEX IF NOT EXISTS %(database)s.goucima_index_z ON goucima (zi);
CREATE INDEX IF NOT EXISTS %(database)s.pinyin_index_i ON pinyin (p0,p1,p2,p3,p4,p5,plen ASC, freq DESC);
''' % { 'database':database }
sqlstr_t = '''
CREATE INDEX IF NOT EXISTS %(database)s.phrases_index_p ON phrases
(%(tabkeystr)s mlen ASC, freq DESC, id ASC);
CREATE INDEX IF NOT EXISTS %(database)s.phrases_index_i ON phrases (phrase, mlen ASC);
'''
tabkeystr = ''
for i in range(self._mlen):
tabkeystr +='m%d,' % i
if database == 'main':
sqlstr = sqlstr_t % {'database':database,'tabkeystr':tabkeystr } + sqlstr
else:
sqlstr = sqlstr_t % {'database':database,'tabkeystr':tabkeystr }
self.db.executescript (sqlstr)
if commit:
self.db.commit()
def compare (self,x,y):
return cmp (x[1], y[1]) or -(cmp (x[-1], y[-1])) \
or -(cmp (x[-2], y[-2])) or (cmp (x[0], y[0]))
def select_words( self, tabkeys, onechar=False, bitmask=0 ):
'''
Get phrases from database by tab_key objects
( which should be equal or less than the max key length)
This method is called in table.py by passing UserInput held data
Return result[:]
'''
# firstly, we make sure the len we used is equal or less than the max key length
_len = min( len(tabkeys),self._mlen )
_condition = ''
_condition += ''.join ( map (lambda x: 'AND m%d = ? ' %x, range(_len) ) )
if onechar:
# for some users really like to select only single characters
_condition += 'AND clen=1 '
if bitmask:
# now just the bits for chinese
all_ints = xrange(1,5)
need_ints = filter (lambda x: x & bitmask, all_ints)
bit_condition = 'OR'.join( map(lambda x: ' category = %d ' %x,\
need_ints) )
_condition += 'AND (%s) ' % bit_condition
# you can increase the x in _len + x to include more result, but in the most case, we only need one more key result, so we don't need the extra overhead :)
# we start search for 1 key more, if nothing, then 2 key more and so on
# this is the max len we need to add into the select cause.
w_len = self._mlen - _len +1
# we start from 2, because it is < in the sqlite select, which need 1 more.
x_len = 2
while x_len <= w_len + 1:
sqlstr = '''SELECT * FROM (SELECT * FROM main.phrases WHERE mlen < %(mk)d %(condition)s
UNION ALL
SELECT * FROM user_db.phrases WHERE mlen < %(mk)d %(condition)s
UNION ALL
SELECT * FROM mudb.phrases WHERE mlen < %(mk)d %(condition)s )
ORDER BY mlen ASC, user_freq DESC, freq DESC, id ASC;''' % { 'mk':_len+x_len, 'condition':_condition}
# we have redefine the __int__(self) in class tabdict.tab_key to return the key id, so we can use map to got key id :)
_tabkeys = map(int,tabkeys[:_len])
_tabkeys += _tabkeys + _tabkeys
result = self.db.execute(sqlstr, _tabkeys).fetchall()
#self.db.commit()
# if we find word, we stop this while,
if len(result) >0:
break
x_len += 1
# Remove duplicated entries other than different id,freq,user_freq
box_uniq = set()
def add2box(x):
box_uniq.add(x[1:-2])
return x
result = [add2box(x) for x in result if x[1:-2] not in box_uniq]
return result[:]
def select_zi( self, tabkeys ):
'''
Get zi from database by tab_key objects
( which should be equal or less than 6)
This method is called in table.py by passing UserInput held data
Return result[:]
'''
# firstly, we make sure the len we used is equal or less than
# the max pinyin length 7 (include tune[1-5])
_len = min( len(tabkeys), 7 )
_condition = ''
#for i in range(_len):
# _condition += 'AND p%d = ? ' % i
_condition += ''.join ( map (lambda x: 'AND p%d = ? ' %x, range(_len)) )
# you can increase the x in _len + x to include more result, but in the most case, we only need one more key result, so we don't need the extra overhead :)
# here we need make sure that the 3 <= plen <=7,
# so , if if _len < 3, than we start from 3;
# if _len >= 3, we start from _len + 1;
# if _len = 7, we start from _len
if _len < 7:
if _len < 3:
x_len = 3
else:
x_len = _len + 1
else:
x_len = _len
while x_len <= 8:
sqlstr = '''SELECT * FROM main.pinyin WHERE plen < %(mk)d %(condition)s
ORDER BY plen ASC, freq DESC;''' % { 'mk':x_len, 'condition':_condition}
# we have redefine the __int__(self) in class tabdict.tab_key to return the key id, so we can use map to got key id :)
_tabkeys = map(int,tabkeys[:_len])
result = self.db.execute(sqlstr, _tabkeys).fetchall()
if len(result) > 0:
break
x_len += 1
#self.db.commit()
return result[:]
def get_ime_property( self, attr ):
'''get IME property from database, attr is the string of property,
which should be str.lower() :)
'''
if not attr in self.ime_property_cache:
sqlstr = 'SELECT val FROM main.ime WHERE attr = ?'
_result = self.db.execute( sqlstr, (attr,)).fetchall()
if _result:
self.ime_property_cache[attr] = _result[0][0]
else:
self.ime_property_cache[attr] = None
return self.ime_property_cache[attr]
def get_phrase_table_index (self):
'''get a list of phrase table columns name'''
return self._pt_index[:]
def generate_userdb_desc (self):
try:
sqlstring = 'CREATE TABLE IF NOT EXISTS user_db.desc (name PRIMARY KEY, value);'
self.db.executescript (sqlstring)
sqlstring = 'INSERT OR IGNORE INTO user_db.desc VALUES (?, ?);'
self.db.execute (sqlstring, ('version', '0.5'))
sqlstring = 'INSERT OR IGNORE INTO user_db.desc VALUES (?, DATETIME("now", "localtime"));'
self.db.execute (sqlstring, ("create-time", ))
self.db.commit ()
except:
import traceback
traceback.print_exc ()
def init_user_db (self,db_file):
if not path.exists (db_file):
db = sqlite3.connect (db_file)
db.execute('PRAGMA page_size = 4096;')
db.execute( 'PRAGMA cache_size = 20000;' )
db.execute( 'PRAGMA temp_store = MEMORY; ' )
db.execute( 'PRAGMA synchronous = OFF; ' )
db.commit()
def get_database_desc (self, db_file):
if not path.exists (db_file):
return None
try:
db = sqlite3.connect (db_file)
db.execute('PRAGMA page_size = 4096;')
db.execute( 'PRAGMA cache_size = 20000;' )
db.execute( 'PRAGMA temp_store = MEMORY; ' )
db.execute( 'PRAGMA synchronous = OFF; ' )
desc = {}
for row in db.execute ("SELECT * FROM desc;").fetchall():
desc [row[0]] = row[1]
self.db.commit()
return desc
except:
return None
def get_table_phrase_len(self, db_file):
table_patt = re.compile(r'.*\((.*)\)')
if not path.exists (db_file):
return 0
try:
db = sqlite3.connect (db_file)
tp_res = db.execute("select sql from sqlite_master\
where name='phrases';").fetchall()
self.db.commit()
res = table_patt.match(tp_res[0][0])
if res:
tp = res.group(1).split(',')
return len(tp)
else:
return 0
except:
return 0
def cache_goucima (self):
self._goucima = {}
goucima = self.db.execute('SELECT * FROM main.goucima;').fetchall()
map(lambda x: self._goucima.update({x[0]:x[1:]}), goucima)
def get_gcm_id (self, zi):
'''Get goucima of given character'''
if self._goucima:
# we already cache the goucima
if not isinstance(zi,unicode):
zi = zi.decode('utf-8')
try:
gcds = self._goucima[zi]
return gcds
except:
pass
sqlstr = 'SELECT %s FROM main.goucima WHERE zi =?;' % ','.join( map (lambda x: 'g%d' % x, range(self._mlen) ) )
return self.db.execute(sqlstr,(zi,)).fetchall()[0]
def parse_phrase (self, phrase):
'''Parse phrase to get its Table code'''
# first we make sure that we are parsing unicode string
try:
phrase = unicode(phrase)
except:
phrase = phrase.decode('utf8')
p_len = len(phrase)
tabkeylist = []
if p_len < 2:
# phrase should not be shorter than 2
return []
#print p_len
try:
if p_len >= self.rules['above']:
rule = self.rules[ self.rules['above'] ]
elif p_len in self.rules:
rule = self.rules[p_len]
else:
raise Exception ('unsupport len of phrase')
if len(rule) > self._mlen:
raise Exception ('fault rule: %s' % rule)
#for (zi,ma) in rule:
# if zi > 0:
# zi -= 1
# gcm = self.get_gcm_id (phrase[zi])
# tabkeylist.append(gcm[ma-1])
tabkeylist = map (lambda x: self.get_gcm_id ( phrase[x[0]-1 if x[0] > 0 else x[0]] )\
[ x[1]-1 if x[1] > 0 else x[1] ], rule )
return [len( tabkeylist)] + [p_len] + tabkeylist[:] + [phrase]
except:
print "pharse pharse \"%s\" fail." % phrase.encode("utf-8")
#import traceback
#traceback.print_exc ()
def parse_phrase_to_tabkeys (self,phrase):
'''Get the Table encoding of the phrase in string form'''
try:
tabres = self.parse_phrase (phrase) [2:-1]
except:
tabres = None
if tabres:
tabkeys= u''.join ( map(self.deparse, tabres) )
else:
tabkeys= u''
return tabkeys
def check_phrase (self,phrase,tabkey=None,database='main'):
# if IME didn't support user define phrase,
# we divide user input phrase into characters,
# and then check its frequence
if type(phrase) != type(u''):
phrase = phrase.decode('utf8')
if self.user_can_define_phrase:
self.check_phrase_internal (phrase, tabkey, database)
else:
map(self.check_phrase_internal, phrase)
def check_phrase_internal (self,phrase,tabkey=None,database='main'):
    '''Check word freq and user_freq, and record the use of `phrase`
    in mudb.

    phrase   -- the committed text (byte strings are decoded from utf-8).
    tabkey   -- optional key string the phrase was typed with; when given
                and not longer than the max key length, the lookup is
                first restricted to rows with exactly these keys.
    database -- not referenced in the body.

    The rows for the phrase are read from main, user_db and mudb and
    split by origin.  Rows already in mudb get user_freq + 1; rows from
    the other two are copied into mudb with a freq marker (1, -3, 2 or
    -2) that sync_usrdb later acts on.  Nothing is written for known
    phrases when dynamic_adjust is off.  Always returns None.
    '''
    if type(phrase) != type(u''):
        phrase = phrase.decode('utf8')
    if self._is_chinese:
        if phrase in tabdict.chinese_nocheck_chars:
            return
    if len(phrase) >=2:
        try:
            wordattr = self.parse_phrase ( phrase )
            # wordattr is [mlen, clen, key ids ..., phrase]
            _len = len (wordattr) -3
        except:
            # if we don't have goucima:
            return
    if (not tabkey) or len(tabkey) > self._mlen :
        sqlstr = '''SELECT * FROM (SELECT * FROM main.phrases WHERE phrase = ?
UNION ALL SELECT * FROM user_db.phrases WHERE phrase = ?
UNION ALL SELECT * FROM mudb.phrases WHERE phrase = ?)
ORDER BY user_freq DESC, freq DESC, id ASC;
'''
        result = self.db.execute(sqlstr, (phrase,phrase,phrase)).fetchall()
    else:
        # we are using this to check whether the tab-key and phrase is in db
        #print "tabkey: ", tabkey
        tabks = self.parse (tabkey)
        #print "tabks: ", tabks
        tabkids = tuple( map(int,tabks) )
        condition = ' and '.join( map(lambda x: 'm%d = ?' % x, range( len(tabks) )) )
        sqlstr = '''SELECT * FROM
(
SELECT * FROM main.phrases WHERE phrase = ? and %(cond)s
UNION ALL SELECT * FROM user_db.phrases WHERE phrase = ? and %(cond)s
UNION ALL SELECT * FROM mudb.phrases WHERE phrase = ? and %(cond)s
)
ORDER BY user_freq DESC, freq DESC, id ASC;
''' % {'cond':condition}
        #print sqlstr
        result = self.db.execute(sqlstr, ((phrase,)+tabkids)*3 ).fetchall()
        if not bool(result):
            # nothing with these keys: fall back to the phrase alone
            sqlstr = '''SELECT * FROM (SELECT * FROM main.phrases WHERE phrase = ?
UNION ALL SELECT * FROM user_db.phrases WHERE phrase = ?
UNION ALL SELECT * FROM mudb.phrases WHERE phrase = ?)
ORDER BY user_freq DESC, freq DESC, id ASC;
'''
            result = self.db.execute(sqlstr, (phrase,phrase,phrase)).fetchall()
    sysdb = {}
    usrdb = {}
    mudb = {}
    #print "result is: ", result
    # per row: [freq, user_freq, [(row without id/freq/user_freq,
    #                              [row without user_freq, user_freq])]]
    searchres = map ( lambda res: [ int(res[-2]), int(res[-1]),
        [(res[1:-2],[res[:-1],res[-1]])] ], result)
    # for sysdb: rows whose user_freq is 0
    reslist=filter( lambda x: not x[1], searchres )
    map (lambda x: sysdb.update(x[2]), reslist)
    #print "sysdb is ", sysdb
    # for usrdb: rows with a user_freq and freq 0 or -1
    reslist=filter( lambda x: ( x[0] in [0,-1] ) and x[1], searchres )
    map (lambda x: usrdb.update(x[2]), reslist)
    #print "usrdb is ", usrdb
    # for mudb: rows with a user_freq and any other freq
    reslist=filter( lambda x: (x[0] not in [0,-1]) and x[1], searchres )
    map (lambda x: mudb.update(x[2]), reslist)
    #print "mudb is ", mudb
    tabkey = ''
    if len(phrase) >=2:
        tabkey = u''.join ( map(self.deparse,wordattr[2:2+_len]) )
        #for k in wordattr[2:2+_len]:
        #    tabkey += self.deparse (k)
    # the %s is filled with one "AND m<i> = ?" clause per key below
    if self._is_chinese:
        sqlstr = 'UPDATE mudb.phrases SET user_freq = ? WHERE mlen = ? AND clen = ? %s AND category = ? AND phrase = ?;'
    else:
        sqlstr = 'UPDATE mudb.phrases SET user_freq = ? WHERE mlen = ? AND clen = ? %s AND phrase = ?;'
    try:
        if len(phrase) == 1:
            # this is a character
            if not self.dynamic_adjust:
                # frequencies are only changed when dynamic_adjust is on
                return
            # we remove the keys contained in mudb from usrdb
            keyout = filter (lambda k: mudb.has_key(k), usrdb.keys() )
            map (usrdb.pop, keyout)
            # we remove the keys contained in mudb and usrdb from sysdb
            keyout = filter (lambda k: mudb.has_key(k) or usrdb.has_key(k) , sysdb.keys() )
            map (sysdb.pop, keyout)
            # first mudb
            map (lambda res: self.db.execute ( sqlstr % ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) ) , [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:]) ) , mudb.keys())
            self.db.commit()
            # -----original for loop of above map:
            #for res in mudb.keys ():
            #    _con = [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:])
            #    _condition = ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) )
            #    self.db.execute ( sqlstr % _condition, _con )
            # then usrdb
            map ( lambda res: self.add_phrase ( (''.join ( map(self.deparse,res[2:2+int(res[0])] ) ),phrase,1,usrdb[res][1]+1 ), database = 'mudb') , usrdb.keys() )
            # -----original for loop of above map:
            #for res in usrdb.keys ():
            #    #if mudb.has_key (res):
            #    #    continue
            #    tabkey = ''.join ( map(self.deparse,res[2:2+int(res[0])] ) )
            #    # here we use freq 1 to denote the phrase needed update in user_db
            #    self.add_phrase ((tabkey,phrase,1,usrdb[res][1]+1 ), database = 'mudb')
            # last sysdb
            map ( lambda res: self.add_phrase ( ( ''.join ( map(self.deparse,res[2:2+int(res[0])]) ),phrase,2,1 ), database = 'mudb'), sysdb.keys() )
            # -----original for loop of above map:
            #for res in sysdb.keys ():
            #    tabkey = ''.join ( map(self.deparse,res[2:2+int(res[0])]) )
            #    # here we use freq 2 to denote the word needed addition to user_db
            #    self.add_phrase ((tabkey,phrase,2,1), database = 'mudb')
        else:
            # this is a phrase
            if len (result) == 0 and self.user_can_define_phrase:
                # this is a new phrase, we add it into user_db
                # (freq -2 marks it as new for sync_usrdb)
                self.add_phrase ( (tabkey,phrase,-2,1), database = 'mudb')
            elif len (result) > 0:
                if not self.dynamic_adjust:
                    # frequencies are only changed when dynamic_adjust is on
                    return
                # we remove the keys contained in mudb from usrdb
                keyout = filter (lambda k: mudb.has_key(k), usrdb.keys() )
                map (usrdb.pop, keyout)
                # we remove the keys contained in mudb and usrdb from sysdb
                keyout = filter (lambda k: mudb.has_key(k) or usrdb.has_key(k) , sysdb.keys() )
                map (sysdb.pop, keyout)
                # first we process mudb
                # the original for loop can be found above in 'len==1'
                map (lambda res: self.db.execute ( sqlstr % ''.join( map(lambda x: 'AND m%d = ? ' % x, range(res[0])) ) , [ mudb[res][1] + 1 ] + list( res[:2+res[0]]) + list (res[2+self._mlen:]) ) , mudb.keys())
                self.db.commit()
                # then usrdb; a stored freq of -1 is carried over as -3,
                # anything else as 1
                map ( lambda res: self.add_phrase ( (''.join ( map(self.deparse,res[2:2+int(res[0])] ) ),phrase,(-3 if usrdb[res][0][-1] == -1 else 1),usrdb[res][1]+1 ), database = 'mudb') , usrdb.keys() )
                # last sysdb
                map ( lambda res: self.add_phrase ( ( ''.join ( map(self.deparse,res[2:2+int(res[0])]) ),phrase,2,1 ), database = 'mudb'), sysdb.keys() )
            else:
                # we come to here when the ime doesn't support user phrase define
                pass
        #self.db.commit()
    except:
        import traceback
        traceback.print_exc ()
def find_zi_code (self,zi):
'''Check word freq and user_freq
'''
zi = zi.decode('utf8')
sqlstr = '''SELECT * FROM main.phrases WHERE phrase = ?
ORDER BY mlen ASC;
'''
result = self.db.execute(sqlstr, (zi,)).fetchall()
#self.db.commit()
codes = []
try:
if result:
for _res in result:
tabkey = u''
for i in range ( int ( _res[1] ) ):
tabkey += self.deparse ( _res[3+i] )
codes.append(tabkey)
except:
import traceback
traceback.print_exc ()
return codes[:]
def remove_phrase (self,phrase,database='user_db'):
'''Remove phrase from database, default is from user_db
phrase should be the a row of select * result from database
Like (id, mlen,clen,m0,m1,m2,m3,phrase,freq,user_freq)
'''
_ph = list(phrase[:-2])
_condition = ''
for i in range(_ph[1]):
_condition += 'AND m%d = ? ' % i
nn =_ph.count(None)
if nn:
for i in range(nn):
_ph.remove(None)
if self._is_chinese:
msqlstr= 'SELECT * FROM %(database)s.phrases WHERE mlen = ? and clen = ? %(condition)s AND category = ? AND phrase = ? ;' % { 'database':database, 'condition':_condition }
else:
msqlstr= 'SELECT * FROM %(database)s.phrases WHERE mlen = ? and clen = ? %(condition)s AND phrase = ? ;' % { 'database':database, 'condition':_condition }
if self.db.execute(msqlstr, _ph[1:]).fetchall():
if self._is_chinese:
sqlstr = 'DELETE FROM %(database)s.phrases WHERE mlen = ? AND clen =? %(condition)s AND category = ? AND phrase = ? ;' % { 'database':database, 'condition':_condition }
else:
sqlstr = 'DELETE FROM %(database)s.phrases WHERE mlen = ? AND clen =? %(condition)s AND phrase = ? ;' % { 'database':database, 'condition':_condition }
self.db.execute(sqlstr,_ph[1:])
self.db.commit()
if self._is_chinese:
msqlstr= 'SELECT * FROM mudb.phrases WHERE mlen = ? and clen = ? %(condition)s AND category = ? AND phrase = ? ;' % { 'condition':_condition }
else:
msqlstr= 'SELECT * FROM mudb.phrases WHERE mlen = ? and clen = ? %(condition)s AND phrase = ? ;' % { 'condition':_condition }
if self.db.execute(msqlstr, _ph[1:]).fetchall():
if self._is_chinese:
sqlstr = 'DELETE FROM mudb.phrases WHERE mlen = ? AND clen =? %(condition)s AND category = ? AND phrase = ? ;' % { 'condition':_condition }
else:
sqlstr = 'DELETE FROM mudb.phrases WHERE mlen = ? AND clen =? %(condition)s AND phrase = ? ;' % { 'condition':_condition }
self.db.execute(sqlstr,_ph[1:])
self.db.commit()
def extra_user_phrases(self, udb, only_defined=False):
'''extract user phrases from database'''
try:
db = sqlite3.connect(udb)
except:
return None
if only_defined:
_phrases = db.execute(\
"SELECT clen, phrase, freq, sum(user_freq)\
FROM phrases \
WHERE freq=-1 AND mlen != 0 \
GROUP BY clen,phrase;").fetchall()
else:
_phrases = db.execute(\
"SELECT clen, phrase, freq, sum(user_freq)\
FROM phrases\
WHERE mlen !=0 \
GROUP BY clen,phrase;").fetchall()
db.commit()
return _phrases[:]