Package translate :: Package storage :: Module tmdb
[hide private]
[frames] | [no frames]

Source Code for Module translate.storage.tmdb

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Module to provide a translation memory database.""" 
 23   
 24  import logging 
 25  import math 
 26  import re 
 27  import threading 
 28  import time 
 29  try: 
 30      from sqlite3 import dbapi2 
 31  except ImportError: 
 32      from pysqlite2 import dbapi2 
 33   
 34  from translate.search.lshtein import LevenshteinComparer 
 35  from translate.lang import data 
 36   
 37   
 38  STRIP_REGEXP = re.compile("\W", re.UNICODE) 
 39   
 40   
class LanguageError(Exception):
    """Raised when a required source or target language is undefined."""

    def __init__(self, value):
        # Deliberately does not call Exception.__init__: the offending
        # value is kept on the instance instead of in args.
        self.value = value

    def __str__(self):
        # String form is simply the stored value's string form.
        return str(self.value)
class TMDB(object):
    """Sqlite-backed translation memory database.

    Connections to the same database file are shared between TMDB
    instances through the class-level ``_tm_dbs`` registry, with one
    (connection, cursor) pair created lazily per thread.
    """

    # db_file -> {thread: (connection, cursor)}; class-level so every
    # instance opened on the same file shares the same per-thread pool.
    _tm_dbs = {}

    def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
        """Open (creating tables if needed) the translation memory database.

        :param db_file: path to the sqlite database file
        :param max_candidates: maximum number of suggestions returned by
            translate_unit()
        :param min_similarity: minimum Levenshtein similarity (percent)
            for a match to be kept
        :param max_length: longest string length considered for matching
        """

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]

        #FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()

    def _get_connection(self, index):
        """Return element *index* (0 = connection, 1 = cursor) of the
        current thread's pair, creating the pair on this thread's first use."""
        current_thread = threading.currentThread()
        if current_thread not in self._tm_db:
            connection = dbapi2.connect(self.db_file)
            cursor = connection.cursor()
            self._tm_db[current_thread] = (connection, cursor)
        return self._tm_db[current_thread][index]

    # Per-thread sqlite connection and cursor, created lazily on access.
    connection = property(lambda self: self._get_connection(0))
    cursor = property(lambda self: self._get_connection(1))

    def init_database(self):
        """creates database tables and indices"""

        # sources holds source-language strings (with context and length,
        # the latter used for Levenshtein pre-filtering); targets holds
        # translations keyed back to sources by sid.
        script = """
CREATE TABLE IF NOT EXISTS sources (
    sid INTEGER PRIMARY KEY AUTOINCREMENT,
    text VARCHAR NOT NULL,
    context VARCHAR DEFAULT NULL,
    lang VARCHAR NOT NULL,
    length INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);

CREATE TABLE IF NOT EXISTS targets (
    tid INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL,
    text VARCHAR NOT NULL,
    lang VARCHAR NOT NULL,
    time INTEGER DEFAULT NULL,
    FOREIGN KEY (sid) references sources(sid)
);
CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
"""

        try:
            self.cursor.executescript(script)
            self.connection.commit()
        except:
            self.connection.rollback()
            raise

    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does"""

        #HACKISH: no better way to detect fts3 support except trying to construct a dummy table?!
        try:
            script = """
DROP TABLE IF EXISTS test_for_fts3;
CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
DROP TABLE test_for_fts3;
"""
            self.cursor.executescript(script)
            logging.debug("fts3 supported")
            # for some reason CREATE VIRTUAL TABLE doesn't support IF NOT EXISTS syntax
            # check if fulltext index table exists manually
            self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                # create fulltext index table, and index all strings in sources
                script = """
CREATE VIRTUAL TABLE fulltext USING fts3(text);
"""
                logging.debug("fulltext table not exists, creating")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")

            # create triggers that would sync sources table with fulltext index
            # (the leading INSERT backfills any sources rows not yet indexed)
            script = """
INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
BEGIN
    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
END;
CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
BEGIN
    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
END;
CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
BEGIN
    DELETE FROM fulltext WHERE docid = OLD.sid;
END;
"""
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True

        except dbapi2.OperationalError, e:
            # fts3 unavailable: fall back to non-fulltext matching and drop
            # any triggers left behind by a previous fts3-enabled run.
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
DROP TRIGGER IF EXISTS sources_insert_trig;
DROP TRIGGER IF EXISTS sources_update_trig;
DROP TRIGGER IF EXISTS sources_delete_trig;
"""
            self.cursor.executescript(script)

    def preload_db(self):
        """ugly hack to force caching of sqlite db file in memory for
        improved performance"""
        # The COUNT(*) join touches every page of the joined tables,
        # warming the OS/sqlite page cache as a side effect.
        if self.fulltext:
            query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
        else:
            query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
        self.cursor.execute(query)
        (numrows,) = self.cursor.fetchone()
        logging.debug("tmdb has %d records" % numrows)
        return numrows

    def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
        """inserts unit in the database

        :param unit: a translate unit object; its own language attributes,
            when set, override the source_lang/target_lang arguments
        :raises LanguageError: if no source or target language can be determined
        """
        #TODO: is that really the best way to handle unspecified
        # source and target languages? what about conflicts between
        # unit attributes and passed arguments
        if unit.getsourcelanguage():
            source_lang = unit.getsourcelanguage()
        if unit.gettargetlanguage():
            target_lang = unit.gettargetlanguage()

        if not source_lang:
            raise LanguageError("undefined source language")
        if not target_lang:
            raise LanguageError("undefined target language")

        unitdict = {"source": unit.source,
                    "target": unit.target,
                    "context": unit.getcontext(),
                    }
        self.add_dict(unitdict, source_lang, target_lang, commit)

    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database

        ``unit`` must provide the keys "source", "target" and "context".
        Duplicate sources/targets (per the unique indices) are silently reused.
        """
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang,
                                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang))
                sid = self.cursor.fetchone()
                (sid,) = sid
            try:
                #FIXME: get time info from translation store
                #FIXME: do we need so store target length?
                self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                                    (sid,
                                     unit["target"],
                                     target_lang,
                                     int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass

            if commit:
                self.connection.commit()
        except:
            if commit:
                self.connection.rollback()
            raise

    def add_store(self, store, source_lang, target_lang, commit=True):
        """insert all units in store in database

        Only units that are translatable and translated are added.
        Returns the number of units inserted.
        """
        count = 0
        for unit in store.units:
            if unit.istranslatable() and unit.istranslated():
                self.add_unit(unit, source_lang, target_lang, commit=False)
                count += 1
        if commit:
            self.connection.commit()
        return count

    def add_list(self, units, source_lang, target_lang, commit=True):
        """insert all units in list into the database, units are
        represented as dictionaries

        Returns the number of units inserted.
        """
        count = 0
        for unit in units:
            self.add_dict(unit, source_lang, target_lang, commit=False)
            count += 1
        if commit:
            self.connection.commit()
        return count

    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source

        Results are dicts with "source", "target", "context" and
        "quality" keys, sorted by descending quality and truncated to
        self.max_candidates.
        """
        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang) for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang) for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)

        # Length bounds outside which a candidate cannot reach min_similarity.
        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source), self.min_similarity, self.max_length)

        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = filter(lambda word: len(word) > 2, unit_words)

        # NOTE(review): a comma-joined language list is bound as a single
        # parameter below, so "IN (?)" compares against the joined string
        # rather than each code individually — verify multi-language lookups.
        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
                       WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
                       AND fulltext MATCH ?"""
            search_str = " OR ".join(unit_words)
            self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
                       WHERE s.lang IN (?) AND t.lang IN (?)
                       AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen))

        results = []
        for row in self.cursor:
            result = {}
            result['source'] = row[0]
            result['target'] = row[1]
            result['context'] = row[2]
            # Exact similarity is computed in Python; the SQL length filter
            # above only pre-prunes obviously-too-different candidates.
            result['quality'] = self.comparer.similarity(unit_source, result['source'], self.min_similarity)
            if result['quality'] >= self.min_similarity:
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", unicode(results))
        return results
def min_levenshtein_length(length, min_similarity):
    """Shortest candidate length that could still reach *min_similarity*
    percent against a string of the given length (never below 2)."""
    ratio = min_similarity / 100.0
    return math.ceil(max(length * ratio, 2))
def max_levenshtein_length(length, min_similarity, max_length):
    """Longest candidate length that could still reach *min_similarity*
    percent against a string of the given length, capped at *max_length*."""
    ratio = min_similarity / 100.0
    return math.floor(min(length / ratio, max_length))