Package translate :: Package storage :: Module poparser
[hide private]
[frames | no frames]

Source Code for Module translate.storage.poparser

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2007 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  import re 
 23   
 24  """ 
 25  From the GNU gettext manual: 
 26       WHITE-SPACE 
 27       #  TRANSLATOR-COMMENTS 
 28       #. AUTOMATIC-COMMENTS 
 29       #| PREVIOUS MSGID                 (Gettext 0.16 - check if this is the correct position; parsed by parse_comment below) 
 30       #: REFERENCE... 
 31       #, FLAG... 
 32       msgctxt CONTEXT                   (Gettext 0.15) 
 33       msgid UNTRANSLATED-STRING 
 34       msgstr TRANSLATED-STRING 
 35  """ 
 36   
# Unbound-method aliases, looked up once at import time.  The parsing
# functions in this module call them as plain functions, e.g.
# startswith(line, 'msgid') or append(some_list, item).
# NOTE(review): str.decode only exists on Python 2 byte strings.
isspace = str.isspace
find = str.find
rfind = str.rfind
startswith = str.startswith
append = list.append
decode = str.decode
 43   
 44   
class ParseState(object):
    """Cursor over an iterator of input lines with one line of look-ahead.

    Attributes:
      - next_line: the line that the next call to read_line() will return
        ('' once the input is exhausted).
      - eof: True once the underlying iterator has been exhausted.
      - encoding: encoding used by decode(), or None to leave strings as-is.
      - UnitClass: factory called without arguments to create new units.
    """

    def __init__(self, input_iterator, UnitClass, encoding=None):
        """Wrap input_iterator and prime the look-ahead line."""
        self._input_iterator = input_iterator
        self.next_line = ''
        self.eof = False
        self.encoding = encoding
        self.UnitClass = UnitClass
        # The first read discards the '' placeholder and loads the first
        # non-blank line into next_line.
        self.read_line()

    def decode(self, string):
        """Return string decoded with self.encoding.

        The string is returned unchanged when no encoding has been set or
        when it is not a byte string (i.e. it is already decoded).
        """
        if self.encoding is not None and isinstance(string, bytes):
            return string.decode(self.encoding)
        return string

    def read_line(self):
        """Return the current look-ahead line and advance to the next one.

        Lines consisting only of whitespace are skipped while advancing.
        When the input runs out, next_line becomes '' and eof becomes True;
        further calls keep returning ''.
        """
        current = self.next_line
        if self.eof:
            return current
        try:
            # The next() builtin (Python 2.6+) works with both Python 2 and
            # Python 3 style iterators, unlike the .next() method.
            self.next_line = next(self._input_iterator)
            while self.next_line.isspace():
                self.next_line = next(self._input_iterator)
        except StopIteration:
            self.next_line = ''
            self.eof = True
        return current

    def new_input(self, _input):
        """Return a new ParseState over _input sharing UnitClass and encoding."""
        return ParseState(_input, self.UnitClass, self.encoding)
76 77
def read_prevmsgid_lines(parse_state):
    """Consume the run of lines starting with '#| ' (or '| ').

    These lines carry the previous msgid and msgctxt information.  Each
    line is returned with everything up to and including the first '| '
    removed; reading stops at the first line without such a prefix.
    """
    collected = []
    while True:
        upcoming = parse_state.next_line
        if not (startswith(upcoming, '#| ') or startswith(upcoming, '| ')):
            return collected
        raw = parse_state.read_line()
        payload_start = raw.index('| ') + 2
        append(collected, raw[payload_start:])
91 92
def parse_prev_msgctxt(parse_state, unit):
    """Parse a 'msgctxt' entry into unit.prev_msgctxt.

    Returns True when unit.prev_msgctxt is non-empty afterwards.
    """
    keyword = 'msgctxt'
    target = unit.prev_msgctxt
    parse_message(parse_state, keyword, len(keyword), target)
    return len(unit.prev_msgctxt) > 0
96 97
def parse_prev_msgid(parse_state, unit):
    """Parse a 'msgid' entry into unit.prev_msgid.

    Returns True when unit.prev_msgid is non-empty afterwards.
    """
    keyword = 'msgid'
    target = unit.prev_msgid
    parse_message(parse_state, keyword, len(keyword), target)
    return len(unit.prev_msgid) > 0
101 102
def parse_prev_msgid_plural(parse_state, unit):
    """Parse a 'msgid_plural' entry into unit.prev_msgid_plural.

    Returns True when unit.prev_msgid_plural is non-empty afterwards.
    """
    keyword = 'msgid_plural'
    target = unit.prev_msgid_plural
    parse_message(parse_state, keyword, len(keyword), target)
    return len(unit.prev_msgid_plural) > 0
106 107
def parse_comment(parse_state, unit):
    """Parse one comment line (or one block of '#|' lines) into unit.

    The look-ahead line, stripped of leading whitespace, is classified by
    its second character:
      - '#.' is appended to unit.automaticcomments
      - '#|' (or a leading '|') starts a previous-msgid block, which is
        parsed into the unit's prev_* attributes
      - '#:' is appended to unit.sourcecomments
      - '#,' is appended to unit.typecomments
      - '#~' is left unconsumed (obsolete units are parsed elsewhere)
      - anything else starting with '#' is appended to unit.othercomments

    Returns a true value when input was consumed: the consumed line, or for
    a previous-msgid block the new look-ahead line.  Returns None when the
    look-ahead line is not a comment or is an obsolete ('#~') line.
    """
    next_line = parse_state.next_line.lstrip()
    if len(next_line) == 0 or next_line[0] not in ('#', '|'):
        return None
    # Slice rather than index: a bare '#' as the very last line of the input
    # (no trailing newline) has no second character.
    next_char = next_line[1:2]
    if next_char == '.':
        unit.automaticcomments.append(parse_state.decode(next_line))
    elif next_line[0] == '|' or next_char == '|':
        # Read all the lines starting with #|
        prevmsgid_lines = read_prevmsgid_lines(parse_state)
        if prevmsgid_lines:
            # Create a parse state object that holds these lines
            ps = parse_state.new_input(iter(prevmsgid_lines))
            # Parse the msgctxt, msgid and msgid_plural, if any
            parse_prev_msgctxt(ps, unit)
            parse_prev_msgid(ps, unit)
            parse_prev_msgid_plural(ps, unit)
            return parse_state.next_line
        # Nothing was consumed (e.g. '#|' without the following space, or
        # an indented '#| ' line).  Keep the line as an ordinary comment and
        # consume it below, otherwise the caller would see the same line
        # forever.
        unit.othercomments.append(parse_state.decode(next_line))
    elif next_char == ':':
        unit.sourcecomments.append(parse_state.decode(next_line))
    elif next_char == ',':
        unit.typecomments.append(parse_state.decode(next_line))
    elif next_char == '~':
        # Special case: we refuse to parse obsoletes: they are done
        # elsewhere to ensure we reuse the normal unit parsing code
        return None
    else:
        unit.othercomments.append(parse_state.decode(next_line))
    return parse_state.read_line()
139 140
def parse_comments(parse_state, unit):
    """Parse comments into unit until parse_comment() returns a false value.

    Returns True when at least one call to parse_comment() succeeded,
    otherwise None.
    """
    found_any = False
    while parse_comment(parse_state, unit):
        found_any = True
    if found_any:
        return True
    return None
148 149
def read_obsolete_lines(parse_state):
    """Consume the '#~' lines of the current obsolete unit.

    Returns the lines with the '#~' marker removed.  Before a msgstr has
    been seen, the marker and any whitespace after it are stripped; after
    it, exactly three characters are stripped.
    """
    gathered = []
    while startswith(parse_state.next_line, '#~'):
        stripped = parse_state.read_line()[2:].lstrip()
        append(gathered, stripped)
        if not startswith(stripped, 'msgstr'):
            continue
        # A msgstr has been seen: from here on only accept continuation
        # strings and further msgstr lines, so that we do not run into the
        # following unit.
        while True:
            upcoming = parse_state.next_line
            is_continuation = startswith(upcoming, '#~ "')
            if not (is_continuation or startswith(upcoming, '#~ msgstr')):
                break
            append(gathered, parse_state.read_line()[3:])
        break
    return gathered
167 168
def parse_obsolete(parse_state, unit):
    """Parse an obsolete ('#~') unit, if one starts at the current position.

    The obsolete lines are stripped of their markers and run through
    parse_unit() on a fresh parse state.  Returns the parsed unit after
    calling its makeobsolete(), or None when there are no obsolete lines or
    nothing could be parsed from them.
    """
    lines = read_obsolete_lines(parse_state)
    if not lines:
        return None
    nested_state = parse_state.new_input(iter(lines))
    parsed = parse_unit(nested_state, unit)
    if parsed is None:
        return None
    parsed.makeobsolete()
    return parsed
177 178
def parse_quoted(parse_state, start_pos=0):
    """Consume the look-ahead line if it holds a quoted string at start_pos.

    The line qualifies when the first double quote at or after start_pos is
    at start_pos itself or is separated from it by whitespace only.  The
    line is then consumed and the text from that quote up to and including
    the last double quote on the line is returned.

    If the string has no terminating quote, one is appended; a trailing
    newline is dropped first.  Returns None, without consuming anything,
    when the line does not qualify.
    """
    line = parse_state.next_line
    left = line.find('"', start_pos)
    if left != start_pos and not line[start_pos:left].isspace():
        return None
    right = line.rfind('"')
    consumed = parse_state.read_line()
    if left != right:
        return consumed[left:right + 1]
    # There is no terminating quote, so we append an extra quote.  Only a
    # real trailing newline is removed: the last line of the input may lack
    # one, and blindly cutting the final character would lose data.
    # (left is -1 when the line has no quote at all; nothing is kept then.)
    if left >= 0:
        tail = consumed[left:]
    else:
        tail = ''
    if tail.endswith('\n'):
        tail = tail[:-1]
    return tail + '"'
191 192
def parse_msg_comment(parse_state, msg_comment_list, string):
    """Collect the quoted strings of a message comment.

    Starting with string, each quoted string is decoded and appended to
    msg_comment_list.  The comment ends with the first string containing a
    literal backslash-n escape; the quoted string following it (or None) is
    returned.  Returns None when the quoted strings run out first.
    """
    current = string
    while current is not None:
        append(msg_comment_list, parse_state.decode(current))
        following = parse_quoted(parse_state)
        if find(current, '\\n') > -1:
            return following
        current = following
    return None
200 201
def parse_multiple_quoted(parse_state, msg_list, msg_comment_list, first_start_pos=0):
    """Read consecutive quoted strings into msg_list and msg_comment_list.

    The first string is looked for at first_start_pos on the current line.
    A string starting with '"_:' is handed to parse_msg_comment(), which
    fills msg_comment_list; all other strings are decoded and appended to
    msg_list.
    """
    quoted = parse_quoted(parse_state, first_start_pos)
    while quoted is not None:
        if startswith(quoted, '"_:'):
            quoted = parse_msg_comment(parse_state, msg_comment_list, quoted)
            continue
        append(msg_list, parse_state.decode(quoted))
        quoted = parse_quoted(parse_state)
210 211
def parse_message(parse_state, start_of_string, start_of_string_len, msg_list, msg_comment_list=None):
    """Parse the quoted strings of an entry introduced by start_of_string.

    Nothing happens unless the look-ahead line starts with start_of_string.
    Otherwise the quoted strings, searched for from start_of_string_len on
    the first line, are added to msg_list, and message comments to
    msg_comment_list (a throw-away list when not given).
    """
    if not startswith(parse_state.next_line, start_of_string):
        return None
    if msg_comment_list is None:
        comments = []
    else:
        comments = msg_comment_list
    return parse_multiple_quoted(parse_state, msg_list, comments, start_of_string_len)
217 218
def parse_msgctxt(parse_state, unit):
    """Parse a 'msgctxt' entry into unit.msgctxt.

    Returns True when unit.msgctxt is non-empty afterwards.
    """
    keyword = 'msgctxt'
    parse_message(parse_state, keyword, len(keyword), unit.msgctxt)
    return len(unit.msgctxt) > 0
222 223
def parse_msgid(parse_state, unit):
    """Parse a 'msgid' entry into unit.msgid and unit.msgidcomments.

    Returns True when either of the two lists is non-empty afterwards.
    """
    keyword = 'msgid'
    parse_message(parse_state, keyword, len(keyword), unit.msgid, unit.msgidcomments)
    if len(unit.msgid) > 0:
        return True
    return len(unit.msgidcomments) > 0
227 228
def parse_msgstr(parse_state, unit):
    """Parse a 'msgstr' entry into unit.msgstr.

    Returns True when unit.msgstr is non-empty afterwards.
    """
    keyword = 'msgstr'
    parse_message(parse_state, keyword, len(keyword), unit.msgstr)
    return len(unit.msgstr) > 0
232 233
def parse_msgid_plural(parse_state, unit):
    """Parse a 'msgid_plural' entry.

    Strings go to unit.msgid_plural and message comments to
    unit.msgid_pluralcomments.  Returns True when either of the two lists
    is non-empty afterwards.
    """
    keyword = 'msgid_plural'
    parse_message(parse_state, keyword, len(keyword),
                  unit.msgid_plural, unit.msgid_pluralcomments)
    if len(unit.msgid_plural) > 0:
        return True
    return len(unit.msgid_pluralcomments) > 0
# Offset of the plural index inside a 'msgstr[N]' line.
MSGSTR_ARRAY_ENTRY_LEN = len('msgstr[')


def add_to_dict(msgstr_dict, line, right_bracket_pos, entry):
    """Add the strings in entry to msgstr_dict under the line's plural index.

    The index is the integer found in line between the 'msgstr[' prefix and
    right_bracket_pos.  Strings for an index that is already present are
    appended to the existing list.
    """
    plural_index = int(line[MSGSTR_ARRAY_ENTRY_LEN:right_bracket_pos])
    msgstr_dict.setdefault(plural_index, []).extend(entry)
246 247
def get_entry(parse_state, right_bracket_pos):
    """Return the quoted strings of the 'msgstr[N]' entry at the current line.

    The strings are searched for right after the closing bracket.  The list
    is empty when the line does not start with 'msgstr['.
    """
    strings = []
    after_bracket = right_bracket_pos + 1
    parse_message(parse_state, 'msgstr[', after_bracket, strings)
    return strings
252 253
def parse_msgstr_array_entry(parse_state, msgstr_dict):
    """Parse one 'msgstr[N]' entry into msgstr_dict.

    Returns True when an entry with at least one string was parsed and
    stored, False otherwise.
    """
    header_line = parse_state.next_line
    bracket_at = find(header_line, ']', MSGSTR_ARRAY_ENTRY_LEN)
    if bracket_at < 0:
        return False
    strings = get_entry(parse_state, bracket_at)
    if len(strings) == 0:
        return False
    add_to_dict(msgstr_dict, header_line, bracket_at, strings)
    return True
266 267
def parse_msgstr_array(parse_state, unit):
    """Parse a run of 'msgstr[N]' entries into a dictionary on unit.msgstr.

    At least one entry is required: when the first one cannot be parsed,
    False is returned and unit.msgstr is left untouched.  Otherwise
    unit.msgstr is replaced by the dictionary of parsed entries and True is
    returned.
    """
    plural_forms = {}
    if not parse_msgstr_array_entry(parse_state, plural_forms):
        return False
    while parse_msgstr_array_entry(parse_state, plural_forms):
        pass
    unit.msgstr = plural_forms
    return True
277 278
def parse_plural(parse_state, unit):
    """Parse a msgid_plural followed by its translation(s).

    The translation may be a 'msgstr[N]' array or, failing that, a plain
    msgstr.  Returns True when both parts were parsed, False otherwise.
    """
    if not parse_msgid_plural(parse_state, unit):
        return False
    if parse_msgstr_array(parse_state, unit):
        return True
    if parse_msgstr(parse_state, unit):
        return True
    return False
285 286
def parse_msg_entries(parse_state, unit):
    """Parse the message part of a unit: optional msgctxt, msgid, translation.

    The translation is either a plain msgstr or a plural group.  Returns
    True when a msgid and a translation were parsed, False otherwise.
    """
    # The context is optional, so its result is not checked.
    parse_msgctxt(parse_state, unit)
    if not parse_msgid(parse_state, unit):
        return False
    if parse_msgstr(parse_state, unit):
        return True
    if parse_plural(parse_state, unit):
        return True
    return False
294 295
def parse_unit(parse_state, unit=None):
    """Parse the next unit from parse_state.

    A new unit is created with parse_state.UnitClass() unless one is passed
    in.  Comments are parsed first; if an obsolete ('#~') unit follows, the
    result of parse_obsolete() is returned.  Otherwise the message entries
    are parsed.

    Returns the unit, after calling its infer_state(), when comments or
    message entries were parsed; returns None when nothing was parsed.
    """
    # Compare against None explicitly: a caller-supplied unit must be used
    # even if its class defines a truth value that is currently false.
    if unit is None:
        unit = parse_state.UnitClass()
    parsed_comments = parse_comments(parse_state, unit)
    obsolete_unit = parse_obsolete(parse_state, unit)
    if obsolete_unit is not None:
        return obsolete_unit
    parsed_msg_entries = parse_msg_entries(parse_state, unit)
    if not (parsed_comments or parsed_msg_entries):
        return None
    unit.infer_state()
    return unit
308 309
def set_encoding(parse_state, store, unit):
    """Set the encoding of store and parse_state from the header unit.

    The encoding is taken from a 'charset=NAME' declaration in unit.msgstr
    when that is a non-empty list of str.  'utf-8' is used when there is no
    declaration or when it is still the 'CHARSET' template placeholder.

    The msgstr strings are searched as stored (quoted and escaped), so the
    charset name ends at whitespace, at a backslash (the start of an escape
    such as the literal backslash-n) or at a double quote.
    """
    encoding = 'utf-8'
    msgstr = unit.msgstr
    if isinstance(msgstr, list) and len(msgstr) > 0 and isinstance(msgstr[0], str):
        # The character class must not contain a bare 'n': that would cut
        # names such as 'windows-1252' or 'latin1' at their first 'n'.
        charset = re.search(r'charset=([^\s\\"]+)', "".join(msgstr))
        if charset and charset.group(1) != 'CHARSET':
            encoding = charset.group(1)
    store._encoding = encoding
    parse_state.encoding = store._encoding
323 324
def decode_list(lst, decode):
    """Return a new list with decode() applied to every item of lst."""
    return list(map(decode, lst))
327 328
def decode_header(unit, decode):
    """Apply decode() to every string stored on the header unit.

    Each of the listed unit attributes is replaced by a decoded copy.  List
    attributes become decoded lists; any other attribute is treated as a
    dictionary of lists and becomes a dictionary of decoded lists.
    """
    attribute_names = (
        'msgctxt', 'msgid', 'msgid_pluralcomments',
        'msgid_plural', 'msgstr', 'obsoletemsgctxt',
        'obsoletemsgid', 'obsoletemsgid_pluralcomments',
        'obsoletemsgid_plural', 'obsoletemsgstr',
        'othercomments', 'automaticcomments', 'sourcecomments',
        'typecomments', 'msgidcomments', 'obsoletemsgidcomments',
    )
    for name in attribute_names:
        value = getattr(unit, name)
        if isinstance(value, list):
            converted = decode_list(value, decode)
        else:
            converted = dict((key, decode_list(items, decode))
                             for key, items in value.items())
        setattr(unit, name, converted)
341 342
def parse_header(parse_state, store):
    """Parse the first unit and use it to establish the encoding.

    The unit is passed to set_encoding() and its strings are then decoded
    with parse_state.decode.  Returns the unit, or None when no unit could
    be parsed.
    """
    header = parse_unit(parse_state)
    if header is not None:
        set_encoding(parse_state, store, header)
        decode_header(header, parse_state.decode)
    return header
350 351
def parse_units(parse_state, store):
    """Parse all units from parse_state and add them to store.

    The first unit is parsed with parse_header(), the rest with
    parse_unit(); each one is passed to store.addunit().  Parsing stops as
    soon as no further unit can be parsed.

    Returns parse_state.eof, i.e. True when the whole input was consumed.
    """
    unit = parse_header(parse_state, store)
    # None is the "no more units" marker of parse_header()/parse_unit();
    # test for it explicitly so that a unit whose class defines a false
    # truth value does not end parsing early.
    while unit is not None:
        store.addunit(unit)
        unit = parse_unit(parse_state)
    return parse_state.eof
358