perseus i18n files from KA
#!/usr/bin/env python | |
"""Extracts translatable strings from HTML exercise files. | |
This program is used for extracting translatable strings from | |
exercise files and optionally outputting a PO or JSON file to be used | |
for further translation. | |
How it works: The script goes through the HTML files and attempts | |
to locate all nodes that have text as a direct child node. It then | |
extracts the HTML contents of that node and returns it as a translatable | |
string. Note that certain nodes are excluded from this list as they | |
contain non-translatable text (see _IGNORE_NODES). It is assumed that | |
everything that isn't part of _IGNORE_NODES is something that needs to be | |
translated. | |
We also call out to babel to extract javascript inside exercises (in | |
<script> tags, but also in some other places where javascript is | |
allowed in exercise html files). | |
""" | |
import cStringIO | |
import HTMLParser | |
import argparse | |
import json | |
import os.path | |
import re | |
import sys | |
# Make sure we can import third_party even when run from the commandline. | |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
import third_party.babel.messages.extract | |
from third_party import i18nize_templates | |
from third_party import polib | |
# All the tags that we want to treat as javascript. We'll call the | |
# javascript extractor on text in these tags, to extract out the | |
# natural language text from javascript. | |
_JAVASCRIPT_TAGS = frozenset(( | |
'script', | |
'var', | |
)) | |
# We also extract javascript from the innerhtml of tags that contain a | |
# given attribute/value pair. We consider a tag to hold javascript if | |
# it has the given attribute name and *CONTAINS* the given value text | |
# inside its attribute value, so '<span class="foo-guess-bar">' matches. | |
_JAVASCRIPT_ATTRVALS = frozenset(( | |
('class', 'graphie'), | |
('class', 'guess'), | |
('class', 'validator-function'), | |
)) | |
# TODO(csilvers): Convert to a more efficient data structure. | |
_JAVASCRIPT_ATTRVAL_ATTRS = frozenset((attr for (attr, _) in | |
_JAVASCRIPT_ATTRVALS)) | |
# Unlike _JAVASCRIPT_ATTRVALS, which says a tag contains javascript if | |
# it has a given attrval, this says that the attribute value *itself* | |
# is javascript. For instance <span data-if="<some javascript>">. | |
_JAVASCRIPT_ATTRS = frozenset(( | |
'data-choices', | |
'data-else', | |
'data-ensure', | |
'data-if', | |
'data-if-else', | |
)) | |
# All the tags that we want to ignore and not extract strings from. | |
_IGNORE_TAGS = frozenset(( | |
'code', | |
'style', | |
)) | |
# Ignore all tags with this attribute name/value pair. We ignore a | |
# tag if it has the given attribute name and *CONTAINS* the given | |
# value text inside its attribute value, so | |
# '<span data-type="a-regexp">' matches. | |
_IGNORE_ATTRVALS = frozenset(( | |
('data-type', 'regex'), | |
)) | |
# TODO(csilvers): Convert to a more efficient data structure. | |
_IGNORE_ATTRVAL_ATTRS = frozenset((attr for (attr, _) in _IGNORE_ATTRVALS)) | |
# The base URL for referencing an exercise | |
_EXERCISE_URL = 'http://www.khanacademy.org/exercise/%s' | |
class UnmatchedEndTagError(Exception): | |
def __init__(self, tagname, linenum, colnum): | |
super(UnmatchedEndTagError, self).__init__() | |
self.linenum = linenum | |
self.colnum = colnum | |
self.tagname = tagname | |
def __str__(self): | |
# Not super-useful since we don't know the filename! | |
return ('No start tag found for end-tag </%s> at line %s, col %s' | |
% (self.tagname, self.linenum, self.colnum)) | |
def main(): | |
"""Handle running this program from the command-line.""" | |
# Handle parsing the program arguments | |
arg_parser = argparse.ArgumentParser( | |
description='Extract translatable strings from HTML exercise files.') | |
arg_parser.add_argument('html_files', nargs='+', | |
help='The HTML exercise files to extract strings from.') | |
arg_parser.add_argument('--output', dest='output', | |
help='The file to write the output to.') | |
arg_parser.add_argument('--format', choices=['po', 'json'], | |
dest='format', default='po', | |
help='The format of the output. (default: %(default)s)') | |
arg_parser.add_argument('--quiet', action='store_true', | |
help='Do not emit status to stderr on successful runs.') | |
args = arg_parser.parse_args() | |
if args.format == 'po': | |
# Output a PO file by default | |
results = unicode(make_potfile(args.html_files, | |
not args.quiet)).encode('utf-8') | |
else: | |
# Optionally output a JSON-encoded data structure | |
results = json.dumps(extract_files(args.html_files, not args.quiet), | |
cls=_SetEncoder, indent=2) | |
if args.output: | |
# If an output location is specified, write the output to that file | |
output_file = open(args.output, 'w') | |
output_file.write(results) | |
output_file.close() | |
else: | |
# Otherwise just write the output to STDOUT | |
print results | |
def make_potfile(files, verbose): | |
"""Generate a PO file from a collection of HTML files. | |
Returns the string representing the PO file. | |
""" | |
# Turn off line-wrapping: it can mess with html markup inside PO comments. | |
output_pot = polib.POFile(wrapwidth=sys.maxint, encoding='utf-8') | |
matches = extract_files(files, verbose) | |
for (nl_text, comments, occurrences) in matches: | |
# Build the PO entry and add it to the PO file | |
# If nl_text is a tuple, it means we have a plural (ngettext) entry | |
if isinstance(nl_text, basestring): | |
(msgid, msgid_plural) = (unicode(nl_text), None) | |
(msgstr, msgstr_plural) = ("", None) | |
else: | |
(msgid, msgid_plural) = (unicode(nl_text[0]), unicode(nl_text[1])) | |
(msgstr, msgstr_plural) = (None, {0: u"", 1: u""}) | |
output_pot.append(polib.POEntry( | |
msgid=msgid, | |
msgid_plural=msgid_plural, | |
msgstr=msgstr, | |
msgstr_plural=msgstr_plural, | |
comment='\n'.join(comments), | |
occurrences=occurrences, | |
)) | |
return output_pot | |
def extract_files(files, verbose): | |
"""Extract a collection of translatable strings from a set of HTML files. | |
Returns: | |
A list of natural language texts and their occurrences: | |
[(nl-text, (comment, ...), | |
((1st-file, 1st-linenum), (2nd-file, 2nd-linenum), ...)), | |
... | |
] | |
For each nl-text, the (file, linenum) pairs are sorted in | |
lexicographic order (first by filename, then by line-number). | |
The list of natural-language texts is sorted by the (1st-file, | |
1st-linenum), to maximize the chances texts from the same file | |
will sort together. | |
""" | |
matches = {} | |
comments = {} | |
# Go through all the exercise files. | |
for filename in files: | |
if verbose: | |
print >>sys.stderr, 'Extracting strings from: %s' % filename | |
extract_file(filename, matches, comments) | |
if verbose: | |
num_matches = len(matches) | |
print >>sys.stderr, ('%s string%s extracted.' | |
% (num_matches, "" if num_matches == 1 else "s")) | |
# Get the matches into the return format. | |
retval = [] | |
for (nl_text, occurrences) in matches.iteritems(): | |
occurrences = sorted(occurrences) | |
# Get the file name of the exercise, to generate a URL reference | |
first_filename = occurrences[0][0] | |
msg_comments = list(comments.get(nl_text, [])) | |
msg_comments.append( | |
unicode('Text is in <%s>' % _filename_to_url(first_filename))) | |
retval.append((nl_text, msg_comments, occurrences)) | |
# Now sort the nl-texts so they come in order of their first occurrence. | |
# We break ties -- should be rare -- arbitrarily. | |
retval.sort(key=lambda (nl_text, comments, occ): (occ[0], nl_text)) | |
return retval | |
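# An illustrative (made-up) return value from extract_files(), matching the
# docstring above: two files, one plain string and one ngettext pair.
#
#   [(u'What is <var>A</var> + <var>B</var>?',
#     [u'Text is in <http://www.khanacademy.org/exercise/addition_1>'],
#     [('addition_1.html', 12), ('addition_2.html', 40)]),
#    ((u'<var>N</var> apple', u'<var>N</var> apples'),
#     [u'Text is in <http://www.khanacademy.org/exercise/addition_1>'],
#     [('addition_1.html', 30)])]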
class I18nExtractor(HTMLParser.HTMLParser): | |
"""Lexes html, and returns interesting tags via get_node(). | |
'Interesting' tags are those that a) have text directly in them | |
(that is, not inside some nested tag), b) are not in _IGNORE_TAGS | |
or _IGNORE_ATTRVALS, and c) are not nested inside another | |
'interesting' tag. The data returned is the full html of that | |
tag: that is, everything between <tag> and </tag> | |
(non-inclusive). Whitespace in the return value is collapsed. | |
This assumes well-formed html! It will do weird things otherwise. | |
NOTE: This class is used by webapp:kake/translate_exercises.py, | |
so be careful about code there if you refactor here. | |
""" | |
class TagInfo(object): | |
def __init__(self, tagname, attrs, | |
startline, startpos, outer_startpos, parent_tag_info): | |
self.tagname = tagname | |
self.attrs = attrs | |
self.should_emit_tag = True # start optimistic | |
self.tag_has_non_whitespace = False # start pessimistic | |
self.startline = startline # line # into text | |
self.startpos = startpos # offset into text | |
self.endpos = None # will point after text | |
self.outer_startpos = outer_startpos # offset into tag | |
self.outer_endpos = None # will point after close-tag | |
# We know right away we shouldn't emit a tag if any of the | |
# following are true: we shouldn't emit the tag's parent, | |
# the tag is in _IGNORE_TAGS, or the tag has some | |
# attribute values that are in _IGNORE_ATTRVALS. | |
if self.should_emit_tag: | |
self.should_emit_tag = (not parent_tag_info or | |
parent_tag_info.should_emit_tag) | |
if self.should_emit_tag: | |
self.should_emit_tag = not self._tag_matches( | |
_IGNORE_TAGS, _IGNORE_ATTRVAL_ATTRS, _IGNORE_ATTRVALS) | |
_IS_SINGULAR_RE = re.compile('^isSingular\((.*)\)$') | |
def _tag_matches(self, tagnames, attrs, attrvals): | |
"""True if our tag is in tagnames or an attr/val is in attrvals.""" | |
if self.tagname in tagnames: | |
return True | |
for attrval in self.attrs: | |
if attrval[0] in attrs: | |
for (match_attr, match_tag) in attrvals: | |
if (match_attr == attrval[0] and | |
match_tag in attrval[1]): # *CONTAINS* | |
return True | |
return False | |
def is_javascript_tag(self): | |
"""True if this tag contains javascript in its innerhtml.""" | |
return self._tag_matches(_JAVASCRIPT_TAGS, | |
_JAVASCRIPT_ATTRVAL_ATTRS, | |
_JAVASCRIPT_ATTRVALS) | |
def nltext_attrvals(self): | |
"""Return attribute values for value/etc attributes in tag.""" | |
# We re-use the code in i18nize_templates which already | |
# finds natural language text in tag attributes. | |
return [self.attrs[i][1] | |
for i in i18nize_templates.natural_language_attributes( | |
self.tagname, self.attrs)] | |
def javascript_attrvals(self): | |
"""Return attribute values for data-if/etc attributes in tag.""" | |
return [val for (attr, val) in self.attrs | |
if attr in _JAVASCRIPT_ATTRS] | |
def is_singular(self): | |
"""foo if this tag has an data-if="isSingular(foo)", else None.""" | |
for (attr, val) in self.attrs: | |
if attr == 'data-if': | |
m = self._IS_SINGULAR_RE.match(val) | |
if m: | |
return m.group(1) | |
return None | |
def __init__(self, *args, **kwargs): | |
HTMLParser.HTMLParser.__init__(self, *args, **kwargs) | |
self.tagstack = [] # stack (list) of TagInfo's. | |
self.candidates = [] # tags that we provisionally should emit | |
self.script_nodes = [] # TagInfos for js content in this html | |
self.script_attrvals = [] # (TagInfo, attr-val) where attrval is js | |
self.nltext_attrvals = [] # (TagInfo, attr-val) where attrval is text | |
def _line_offset_to_pos(self, line_and_offset): | |
"""Input is a tuple (5, 10): 10th char of the 5th line.""" | |
return self.linepos[line_and_offset[0]] + line_and_offset[1] | |
# Older versions of HTMLParser (2.7.1, at least) have a bug where | |
# they end <script> tags on *any* </tag>, not just </script>. Fix | |
# that. Later versions of Python fix the bug but add a new arg | |
# to set_cdata_mode(), so we have to handle that too. | |
def set_cdata_mode(self, *args): | |
self.interesting = re.compile(r'</%s' % re.escape(self.lasttag)) | |
# Sadly, there's no regexp pattern that just matches unicode | |
# alphabetic characters, so we do the inverse: non-alnums + digits | |
# = non-alphabetic. | |
_NO_ALPHA_RE = re.compile('^[\W\d]+$', re.UNICODE) | |
def handle_starttag(self, tag, attrs): | |
# Read past the start-tag; startpos is the start of the 'inner' html. | |
startline = self.getpos()[0] | |
outer_startpos = self._line_offset_to_pos(self.getpos()) | |
startpos = outer_startpos + len(self.get_starttag_text()) | |
taginfo = I18nExtractor.TagInfo( | |
tag, attrs, startline, startpos, outer_startpos, | |
self.tagstack[-1] if self.tagstack else None) | |
# Sometimes tags have attributes whose values are natural | |
# language text (e.g. <input value="some text">). If so, | |
# store that fact. | |
for nltext_val in taginfo.nltext_attrvals(): | |
if nltext_val and not self._NO_ALPHA_RE.match(nltext_val): | |
self.nltext_attrvals.append((taginfo, nltext_val)) | |
# Sometimes tags have attributes whose values are javascript | |
# (e.g. <span data-if="some_javascript_code">). If so, store | |
# that fact. | |
for js_val in taginfo.javascript_attrvals(): | |
if js_val: | |
self.script_attrvals.append((taginfo, js_val)) | |
self.tagstack.append(taginfo) | |
def handle_endtag(self, tag): | |
# We need the while because not all tags have end-tags (e.g. <meta>) | |
while self.tagstack and self.tagstack[-1].tagname != tag: | |
self.tagstack.pop() | |
# This can fail if the html is not well-formed (no balanced tags) | |
if not self.tagstack: | |
(linenum, colnum) = self.getpos() | |
raise UnmatchedEndTagError(tag, linenum, colnum) | |
tag_info = self.tagstack.pop() | |
# Update endpos | |
tag_info.endpos = self._line_offset_to_pos(self.getpos()) | |
# outer_endpos points after the end of this tag. HTMLParser | |
# doesn't expose this info, so we depend on the fact end-tags | |
# can't have tag-attrs, so searching for '>' is good enough. | |
tag_info.outer_endpos = self.text.index('>', tag_info.endpos) + 1 | |
# If the tag's innerhtml is javascript, add it to our list of | |
# javascript nodes. Else if tagname-is-good is set, and the | |
# tag contains non-ws text, then its contents are a candidate | |
# to be extracted. However, its parents get dibs: we don't | |
# extract this if we extract a parent. We will have to wait | |
# until the parent is done, to see. | |
if tag_info.is_javascript_tag(): | |
self.script_nodes.append(tag_info) | |
elif tag_info.should_emit_tag and tag_info.tag_has_non_whitespace: | |
self.candidates.append(tag_info) | |
def handle_data(self, data): | |
"""Callback for text between tags.""" | |
if data.strip(): # not just whitespace | |
assert self.tagstack | |
self.tagstack[-1].tag_has_non_whitespace = True | |
def handle_charref(self, charref): | |
"""Callback for data that starts with &, e.g. '.""" | |
assert self.tagstack | |
self.tagstack[-1].tag_has_non_whitespace = True | |
def feed(self, text): | |
"""Store the text so we can print from it, and make line->pos table.""" | |
self.text = text | |
self.linepos = [None, 0] # dummy 0-th line; linenums start at 1 | |
while True: | |
newline = text.find('\n', self.linepos[-1]) | |
if newline == -1: | |
break | |
self.linepos.append(newline + 1) | |
HTMLParser.HTMLParser.feed(self, text) | |
def cleaned_text(self, tag_info): | |
"""Return text between <tag> and </tag>, cleans up whitespaces. | |
Get rid of leading and trailing whitespace, and collapse runs of | |
whitespace to a single whitespace (changing newline to space). | |
Returns: | |
The cleaned text, and the line-number that the cleaned | |
text starts on. | |
""" | |
text = self.text[tag_info.startpos:tag_info.endpos] | |
# Figure out the line number of the first non-whitespace char. | |
line_number = tag_info.startline | |
leading_whitespace = text[:len(text) - len(text.lstrip())] | |
line_number += leading_whitespace.count('\n') | |
# Normalize whitespace and return. | |
return (re.sub(r'\s+', ' ', text).strip(), line_number) | |
def nltext_nodes(self): | |
"""Yields TagInfo objects representing nodes with nl-text in them.""" | |
# If one candidate is inside another one, we print the outside | |
# one. We can figure this out via sorting. | |
self.candidates.sort(key=lambda tag_info: tag_info.startpos) | |
if self.candidates: | |
yield self.candidates[0] | |
parent_range = (self.candidates[0].startpos, | |
self.candidates[0].endpos) | |
for i in xrange(1, len(self.candidates)): | |
# If we're entirely inside our parent, ignore us. | |
if (self.candidates[i].startpos >= parent_range[0] and | |
self.candidates[i].endpos <= parent_range[1]): | |
continue | |
yield self.candidates[i] | |
parent_range = (self.candidates[i].startpos, | |
self.candidates[i].endpos) | |
def javascript_nodes(self): | |
"""Yields TagInfo objects representing nodes with js-text in them.""" | |
for taginfo in self.script_nodes: | |
yield taginfo | |
def javascript_attribute_values(self): | |
"""Yields (TagInfo, attribute_value_string) when attr-value is js.""" | |
for (taginfo, attrval_string) in self.script_attrvals: | |
yield (taginfo, attrval_string) | |
def nltext_attribute_values(self): | |
"""Yields (TagInfo, attribute_value_string) when attr-value is text.""" | |
for (taginfo, attrval_string) in self.nltext_attrvals: | |
yield (taginfo, attrval_string) | |
_JS_GETTEXT_RE = re.compile(r'\$\._|\$\.ngettext') | |
def javascript_has_no_i18n_markup(js_text): | |
"""Return true if js_text does not have $._ or $.ngettext in it. | |
This is an optimization -- if a quick search for $._ or $.ngettext | |
fails, then we know we don't need to do the more expensive | |
javascript tokenization looking for strings to extract. This is | |
because the only time we have strings to extract in js is in $._() | |
and $.ngettext(). | |
Note that this can return False even if there's no actual $._ in | |
the code (for instance, if '$._' appears in a comment). That's ok | |
though; it just means we don't benefit from the optimization for | |
this file. | |
""" | |
return not _JS_GETTEXT_RE.search(js_text) | |
def _extract_javascript(filename, js_text, js_start_line, matches, comments): | |
"""js_start_line: where this javascript starts inside the html file.""" | |
# All i18n markup in javascript is via $._ and $.ngettext, so if | |
# those aren't present, we can bail, confident in the lack of nltext. | |
if javascript_has_no_i18n_markup(js_text): | |
return | |
for (lineno, funcname, message, msg_comments) in ( | |
third_party.babel.messages.extract.extract_javascript( | |
cStringIO.StringIO(js_text.encode('utf-8')), | |
third_party.babel.messages.extract.DEFAULT_KEYWORDS, | |
['I18N:'], | |
{})): | |
# the javascript extractor has a 'feature' where it appends a | |
# third None argument if an ngettext message has interpolated | |
# strings ("%(foo)s"). We ignore that. | |
if isinstance(message, tuple): | |
message = message[:2] | |
matches.setdefault(message, set()).add((filename, | |
js_start_line - 1 + lineno)) | |
comments.setdefault(message, []).extend(msg_comments) | |
def extract_file(filename, matches, comments, contents=None): | |
"""Extract a collection of translatable strings from an HTML file. | |
This function modifies matches and comments in place with new | |
content that it discovers. | |
Arguments: | |
filename: the .html file to extract natural language text from. | |
matches: a dict from found nl-strings to a set of | |
(filename, linenumber) pairs where this string is found. | |
comments: a dict from found nl-strings to a list of | |
comments about those strings (extracted from the source code). | |
contents: if not None, the contents of file 'filename'. | |
""" | |
if contents is None: | |
with open(filename) as f: | |
contents = f.read().decode('utf-8') | |
extractor = I18nExtractor() | |
extractor.feed(contents) | |
singular = None # used when collecting singular + plural for ngettext | |
singular_occ = None | |
for tag_info in extractor.nltext_nodes(): | |
(text, linenum) = extractor.cleaned_text(tag_info) | |
if singular is not None: | |
# If the last tag was the singular part of an ngettext | |
# call, we're the plural. | |
matches.setdefault((singular, text), set()).add(singular_occ) | |
# TODO(csilvers): extract comments from source and append here. | |
comments.setdefault((singular, text), []).extend([]) | |
singular = None | |
singular_occ = None | |
elif tag_info.is_singular(): | |
# If *we're* the singular part of an ngettext call, store | |
# that info so the next tag can add us to matches. | |
singular = text | |
singular_occ = (filename, linenum) | |
else: | |
# Normal, gettext call. | |
matches.setdefault(text, set()).add((filename, linenum)) | |
# TODO(csilvers): extract comments from source and append here. | |
comments.setdefault(text, []).extend([]) | |
# We also need to worry about natural language text inside tags, | |
# such as <input value="natural language text">. | |
for (tag_info, nl_text) in extractor.nltext_attribute_values(): | |
# tag_info.startline may not be exactly right, since the | |
# attribute may not be on the same line as the start of the | |
# tag, but it's hopefully close enough to be useful. | |
matches.setdefault(nl_text, set()).add((filename, tag_info.startline)) | |
# TODO(csilvers): extract comments from source and append here. | |
comments.setdefault(nl_text, []).extend([]) | |
# We also need to extract nl-strings from javascript -- in two | |
# places, text in tags (<script>js</script>) and text in tag | |
# attributes (<span data-if="js">). We call on babel to help with | |
# both. | |
for tag_info in extractor.javascript_nodes(): | |
js_text = extractor.text[tag_info.startpos:tag_info.endpos] | |
# Babel will report line-numbers starting from 1, but really | |
# they should be starting from tag_info.startline. | |
_extract_javascript(filename, js_text, tag_info.startline, | |
matches, comments) | |
for (tag_info, js_text) in extractor.javascript_attribute_values(): | |
# tag_info.startline may not be exactly right, since the | |
# attribute may not be on the same line as the start of the | |
# tag, but it's hopefully close enough to be useful. | |
_extract_javascript(filename, js_text, tag_info.startline, | |
matches, comments) | |
def _filename_to_url(filename): | |
"""Convert an exercise filename into a khan academy url.""" | |
# Get the file name of the exercise, to generate a URL reference | |
basename = os.path.basename(filename) | |
name = os.path.splitext(basename)[0] | |
return _EXERCISE_URL % name | |
class _SetEncoder(json.JSONEncoder): | |
"""Encode set data structures as lists in JSON encoding. | |
From: http://stackoverflow.com/a/8230505/6524 | |
""" | |
def default(self, obj): | |
if isinstance(obj, set): | |
return list(obj) | |
return json.JSONEncoder.default(self, obj) | |
def babel_extract(fileobj, keywords, comment_tags, options): | |
"""Babel extraction method for exercises templates. | |
Arguments: | |
fileobj: the file-like object the messages should be extracted from, | |
in this case a single exercise file. | |
keywords: a list of keywords (i.e. function names) that should be | |
recognized as translation functions. Ignored. | |
comment_tags: a list of translator tags to search for and include | |
in the results. Ignored. | |
options: a dictionary of additional options (optional) | |
Returns: | |
An iterator over (lineno, funcname, message, comments) tuples. | |
""" | |
filename = fileobj.name | |
for (nl_text, comments, occurrences) in extract_files([filename], | |
verbose=False): | |
line_numbers = set(o[1] for o in occurrences) | |
for line_number in line_numbers: | |
if isinstance(nl_text, basestring): | |
yield (line_number, '_', nl_text, comments) | |
else: | |
yield (line_number, 'ngettext', nl_text, comments) | |
if __name__ == '__main__': | |
main() |
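The extractor above can also be driven programmatically through extract_file(). Here is a minimal sketch of that (Python 2), assuming the file lives at the exercises.babel import path it is referenced by later in this gist; the HTML snippet and file name are made up for illustration.

import pprint

import exercises.babel  # assumed import path for the extractor above

matches = {}   # nl-text -> set of (filename, linenum) pairs
comments = {}  # nl-text -> list of translator comments

sample_html = u"""
<div class="exercise">
  <p class="problem">What is <var>A</var> + <var>B</var>?</p>
  <span data-if="isSingular(N)">You have <var>N</var> apple.</span>
  <span data-else>You have <var>N</var> apples.</span>
</div>
"""

exercises.babel.extract_file('sample.html', matches, comments,
                             contents=sample_html)
# matches now maps each extracted string (or a (singular, plural) tuple for
# the ngettext-style data-if="isSingular(...)" pair) to where it was seen.
pprint.pprint(matches)

The next file is the webapp-side counterpart: rather than extracting strings offline, it translates serialized perseus assessment items at request time.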
"""Holds tools needed to translate assessment items.""" | |
import copy | |
import json | |
import re | |
import api.jsonify | |
import assessment_items.models | |
from intl import i18n | |
import intl.regexps | |
import intl.request | |
# Matches a number inside perseus text with thousands group separator and | |
# decimal point. ie 9,100.3 | |
# Inside $...$ though the commas should have surrounding curly braces. ie. | |
# $9{,}100.3$ | |
_PERSEUS_NUMBER_RE = re.compile('\d+((\{?,\}?)\d{3})*\.?\d*', re.UNICODE) | |
# After we detect a perseus number string we need to remove the curly braces | |
# and the comma if we are to cast it to a float | |
_BAD_FLOAT_CHARS_RE = re.compile('[\{\},]', re.UNICODE) | |
# In order to detect whether we are inside or outside of $...$ we need to first | |
# find out how many non-escaped dollar signs come before us. "I owe \$5,000" | |
# would not match anything since the dollar sign is escaped. Theoretically the | |
# slash beforehand could also be escaped, ie. "\\$5{,}000$", in which case the | |
# dollar sign is for real. For completeness we check for that too. | |
_NON_ESCAPED_DOLLAR_SIGN_RE = re.compile(r"(?<!\\)(?:\\\\)*\$", re.UNICODE) | |
def _translate_number(match): | |
"""Translate a number in perseus text to match request language's # format. | |
Different langauges use different decimal and group separators. 1,234.56 | |
in English is translated to 1.234,56 in Spanish. If the number appears | |
within TeX then curly braces are added to surround the comma. | |
This function will preserve the number of decimal places shown in the | |
English version, so 5.00 is translated to 5,00 in Spanish. | |
""" | |
# TODO(james): Merge this with a similar function in download_i18n.py | |
# Get the default number pattern for the current language | |
decimal_format = i18n.request_language_decimal_format() | |
if not decimal_format: | |
# Fake languages (ie. boxes & accents) won't have a decimal format. | |
# So we leave the number untranslated | |
# TODO(james): consider translating these on the fly as well | |
return match.group(0) | |
# We need to remove any curly braces or commas from the match if we are to | |
# be able to cast this number_string to a float. | |
number_string = _BAD_FLOAT_CHARS_RE.sub("", match.group(0)) | |
# Find the precision of the fractional part of the number | |
# ie. for 1.240 -> 3 | |
frac_prec = 0 | |
decimal_pos = number_string.find(".") | |
if decimal_pos >= 0: | |
frac_prec = len(number_string) - (decimal_pos + 1) | |
# Change the decimal format precision (min, max) so they both match | |
# the precision of the fractional part in the English version to ensure we | |
# return a string with the same precision. Since we're changing | |
# global state we make a copy first. | |
decimal_format = copy.copy(decimal_format) | |
decimal_format.frac_prec = (frac_prec, frac_prec) | |
# TODO(james): format_decimal rounds numbers hence "$5.00" -> "$5" | |
# Figure out how to not get it to round, or to add significant figures | |
# back in afterwards | |
translated_number = unicode(i18n.format_decimal(float(number_string), | |
decimal_format)) | |
# We want to add {} around the number if it is inside $...$ but not if it is | |
# outside. Unlike download_i18n.py's automatic number translation, where we | |
# fully parse the text, here we have no alpha characters, so we can assume | |
# we are inside $...$ if there is an odd number of non-escaped $s before | |
# the match. | |
num_dollar_signs = len(_NON_ESCAPED_DOLLAR_SIGN_RE.findall( | |
match.string[:match.start()])) | |
if num_dollar_signs % 2 == 1: | |
return translated_number.replace(",", "{,}") | |
return translated_number | |
def translate_serialized_assessment_item(item): | |
mo_locale = intl.request.locale_for_mo() | |
if mo_locale == "en": | |
# If we are in English don't bother translating as this was the | |
# language the content was written in. | |
return item | |
# keep track of which parts have been translated | |
translatable_parts = [] | |
# crowdin's jipt (just in place translation) locale used on | |
# translate.khanacademy.org | |
jipt_locale = intl.request.jipt_locale_for_mo() | |
def _maybe_translate(part, attr_or_index): | |
"""Translate in place all natural language text. | |
This replaces in memory the value of part[attr_or_index], where part is | |
either a dict (attr_or_index is a key) or a list (attr_or_index is an | |
index), with its translation. | |
""" | |
text = part[attr_or_index] | |
if text and not intl.regexps.NO_NEED_TO_TRANSLATE.match(text): | |
part[attr_or_index] = i18n._(text) # @Nolint - it is ok to be | |
# translating a variable here. | |
if jipt_locale and "crwdns" not in part[attr_or_index]: | |
# We are on translate.ka.org but the string doesn't seem to | |
# have the jipt tags, so we add a warning message to the user, | |
# and we don't count this as a translatable part. | |
part[attr_or_index] = ("$\\large{\\red{\\text{The following " | |
"content is not yet on crowdin, check " | |
"back in a week}}}$\n\n%s" % | |
part[attr_or_index]) | |
else: | |
translatable_parts.append(text) | |
elif text: | |
# No alpha characters, but let's see if it has any numbers in it | |
# that we could automatically translate. | |
part[attr_or_index] = _PERSEUS_NUMBER_RE.sub(_translate_number, | |
text) | |
item_data = json.loads(item["item_data"]) | |
assessment_items.models.AssessmentItem.traverse_natural_language_parts( | |
item_data, _maybe_translate) | |
# If we are on translate.khanacademy.org we want to calculate the | |
# percentage based upon crowdin's jipt (just-in-place translation) locale. | |
check_translated_locale = jipt_locale if jipt_locale else mo_locale | |
if translatable_parts: | |
parts_translated = sum( | |
[i18n.has_translation(check_translated_locale, text) | |
for text in translatable_parts]) | |
item["percent_translated"] = ( | |
parts_translated / float(len(translatable_parts))) * 100 | |
else: | |
# If there is nothing to translate in this part we assume it is 100% | |
# translated | |
item["percent_translated"] = 100.0 | |
item["item_data"] = json.dumps(item_data) | |
return item | |
def is_fully_translated(assessment_item): | |
"""Determine if an assessment item is fully translated in request locale""" | |
serialized_item = api.jsonify.as_serializable(assessment_item) | |
translated_item = translate_serialized_assessment_item(serialized_item) | |
return translated_item.get("percent_translated", 100.0) == 100.0 |
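The number handling above defers to i18n.format_decimal and the request locale. As a self-contained sketch of the same idea (Python 2), here is a stand-in that hard-codes a Spanish-style separator swap instead of using KA's i18n module; the regexes are copied from the file above.

import re

_PERSEUS_NUMBER_RE = re.compile(r'\d+((\{?,\}?)\d{3})*\.?\d*', re.UNICODE)
_BAD_FLOAT_CHARS_RE = re.compile(r'[{},]', re.UNICODE)
_NON_ESCAPED_DOLLAR_SIGN_RE = re.compile(r'(?<!\\)(?:\\\\)*\$', re.UNICODE)


def translate_numbers(text, decimal_sep=',', group_sep='.'):
    """Rewrite numbers in text with the given separators (Spanish defaults).

    The string formatting here stands in for i18n.format_decimal in the
    real code; the precision of the English number is preserved.
    """
    def _replace(match):
        number_string = _BAD_FLOAT_CHARS_RE.sub('', match.group(0))
        frac_prec = 0
        decimal_pos = number_string.find('.')
        if decimal_pos >= 0:
            frac_prec = len(number_string) - (decimal_pos + 1)
        # Format with a thousands separator and the English precision,
        # then swap the separators for the target language's.
        formatted = '{:,.{prec}f}'.format(float(number_string),
                                          prec=frac_prec)
        formatted = (formatted.replace(',', '\0')
                              .replace('.', decimal_sep)
                              .replace('\0', group_sep))
        # If we sit inside $...$ (an odd number of unescaped $s before the
        # match), protect the group separator with braces for TeX.
        n_dollars = len(_NON_ESCAPED_DOLLAR_SIGN_RE.findall(
            match.string[:match.start()]))
        if n_dollars % 2 == 1:
            formatted = formatted.replace(group_sep, '{%s}' % group_sep)
        return formatted

    return _PERSEUS_NUMBER_RE.sub(_replace, text)


print translate_numbers(u'Pay $9{,}100.30$ now, or 1,234.5 later.')
# -> Pay $9{.}100,30$ now, or 1.234,5 later.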
"""Holds AssessmentItem and related entities. | |
AssessmentItem: database entity about a single assessment item that can be | |
included in exercises, and other things in the future | |
AssessmentItemRevision: database entity about a single version of an assessment | |
item, of which the revision is a child | |
""" | |
import os | |
from google.appengine.ext import db | |
from google.appengine.ext import ndb | |
import api.jsonify | |
import backup_model | |
import compat_key | |
import content | |
import datetime | |
import db_util | |
import layer_cache | |
import setting_model | |
@db_util.disable_ndb_memcache | |
class AssessmentItemTag(backup_model.BackupModelNDB): | |
_serialize_whitelist = ['id', 'description', 'display_name'] | |
description = ndb.StringProperty() | |
display_name = ndb.StringProperty() | |
deleted = ndb.BooleanProperty(indexed=True, default=False) | |
@property | |
def id(self): | |
return self.key.urlsafe() | |
def validate(self): | |
# TODO(cbhl): Prevent duplicate tags from being created. | |
pass | |
@staticmethod | |
def create(**kwargs): | |
"""Create and return a new tag, without putting it to the datastore.""" | |
# Keep all the tags in the same entity group to get strong consistency | |
item = AssessmentItemTag(parent=ndb.Key(AssessmentItemTag, '0'), | |
**kwargs) | |
item.validate() | |
return item | |
@staticmethod | |
def query_all(): | |
return AssessmentItemTag.query(AssessmentItemTag.deleted == False, | |
ancestor=ndb.Key(AssessmentItemTag, '0')) | |
@staticmethod | |
@layer_cache.cache_with_key_params_fxn( | |
lambda: setting_model.Setting.cached_assessment_item_tags_date, | |
cache_separately_per_display_language=False) | |
def fetch_all(): | |
return AssessmentItemTag.query_all().fetch() | |
@staticmethod | |
def find_by_display_name(display_name): | |
# TODO(alpert): Filter by deleted here? | |
return AssessmentItemTag.query( | |
AssessmentItemTag.display_name == display_name, | |
ancestor=ndb.Key(AssessmentItemTag, '0')).get() | |
class BaseAssessmentItem(object): | |
content_kind = "AssessmentItem" | |
content_kind_code = "i" | |
item_data = ndb.StringProperty(indexed=False) | |
author_names = ndb.StringProperty(indexed=False, repeated=True) | |
created_by = ndb.KeyProperty(indexed=True) | |
tags = ndb.KeyProperty(indexed=True, repeated=True, kind=AssessmentItemTag) | |
# Name used to refer to this item in the perseus interface | |
name = ndb.StringProperty(indexed=False) | |
@staticmethod | |
def random_id(): | |
"""Generate a random unique identifier for a future new entity. | |
We override the implementation from BaseRevision because we expect to | |
have many thousands of items and so the probability of a collision with | |
32-bit keys is significant.""" | |
return 'x' + os.urandom(8).encode('hex') | |
@staticmethod | |
def traverse_item_data(item_data, handle_content, handle_widget): | |
# We parse item_data using the schema at: | |
# https://gist.github.com/alopatin/96d923d57bcfb7e8bf4e | |
# If you change this code make sure to change the corresponding | |
# translating code in api/v1_assessment_items.py | |
handle_content(item_data.get("question", {})) | |
for hint in item_data.get("hints", []): | |
handle_content(hint) | |
answer = item_data.get("answerArea", {}) | |
if answer.get("type") == "multiple": | |
handle_content(answer.get("options", {})) | |
else: | |
handle_widget(answer) | |
@staticmethod | |
def traverse_natural_language_parts(item_data, handler): | |
"""Iterate over all the natural language parts in item_data | |
This will call handler(part, attr_or_index), where the part will either | |
be a dict and the second arg the key to the natural language within | |
that dict, or part will be a list and second arg the index to the | |
natural language part within the list. This allows the handler | |
function to change part[attr_or_index], modifying its value in memory. | |
We parse the item_data using the schema at: | |
https://gist.github.com/alopatin/96d923d57bcfb7e8bf4e | |
If we change that schema, by say adding a new widget, we will need to | |
update this function. | |
""" | |
def handle_list_items(dict_part, attr): | |
part = dict_part.get(attr, []) | |
for index in xrange(0, len(part)): | |
handler(part, index) | |
def handle_content_in_list_items(dict_part, attr): | |
parts = dict_part.get(attr, []) | |
for part in parts: | |
if part.get("content"): | |
handler(part, "content") | |
def handle_widget(widget): | |
options = widget.get("options", {}) | |
if widget.get("type") in ["radio", "dropdown"]: | |
handle_content_in_list_items(options, "choices") | |
elif widget.get("type") == "categorization": | |
handle_content_in_list_items(options, "items") | |
handle_list_items(options, "categoryHeaders") | |
elif widget.get("type") == "plotter": | |
handle_list_items(options, "labels") | |
handle_list_items(options, "categories") | |
elif widget.get("type") == "orderer": | |
handle_content_in_list_items(options, "options") | |
handle_content_in_list_items(options, "correctOptions") | |
handle_content_in_list_items(options, "otherOptions") | |
elif widget.get("type") == "matcher": | |
handle_list_items(options, "labels") | |
handle_list_items(options, "left") | |
handle_list_items(options, "right") | |
elif widget.get("type") == "sorter": | |
handle_list_items(options, "correct") | |
elif widget.get("type") == "image": | |
handle_content_in_list_items(options, "labels") | |
def handle_renderer(renderer): | |
if renderer.get("content"): | |
handler(renderer, "content") | |
for widget in renderer.get("widgets", {}).itervalues(): | |
handle_widget(widget) | |
AssessmentItem.traverse_item_data( | |
item_data, | |
lambda x: handle_renderer(x), | |
lambda x: handle_widget(x)) | |
def preview_relative_url(self): | |
return "/preview/content/items/%s" % self.key.id() | |
def preview_relative_url_in_exercise(self, exercise): | |
"""Return a url that contains all items in exercise jumping to this one | |
This presumes that the item is in the exercise passed in. If it is not | |
that it will just go to the first assessment item in that exercise. | |
""" | |
return "/preview/content/items?exercises=%s#%s" % (exercise.name, | |
self.key.id()) | |
@db_util.disable_ndb_memcache | |
class AssessmentItem(BaseAssessmentItem, | |
content.models.VersionedContentNDB): | |
"""Information about a single assessment item.""" | |
# Metaclass is needed to allow inheriting DB properties from | |
# BaseAssessmentItem | |
__metaclass__ = db_util.NDBInheritModelPropertiesType | |
_serialize_whitelist = [ | |
'id', 'creation_date', 'item_data', 'author_names', | |
'tags', 'sha', 'name'] | |
edit_revision = ndb.KeyProperty(kind='AssessmentItemRevision', | |
indexed=False) | |
is_dirty = ndb.ComputedProperty(lambda self: | |
self.edit_revision and self.edit_revision.id() != self.sha) | |
def get_edit_revision(self): | |
return self.get_edit_revision_async().get_result() | |
def get_edit_revision_async(self): | |
if self.edit_revision: | |
return self.edit_revision.get_async() | |
else: | |
return self.get_current_revision_async() | |
def get_revisions(self): | |
return (AssessmentItemRevision.query() | |
.filter(AssessmentItemRevision.content_id == self.id).fetch()) | |
@staticmethod | |
def create_sorted_revision_list(revisions): | |
def date_sort(revision): | |
return revision.creation_date or datetime.datetime.min | |
sorted_revisions = sorted(revisions, key=date_sort, reverse=True) | |
# Fetch revision creators | |
created_by_keys = set() | |
for revision in revisions: | |
if revision.created_by: | |
created_by_keys.add(compat_key.from_(revision.created_by).db) | |
created_by = {user.key(): user for user in | |
db.get(list(created_by_keys))} | |
def get_created_by(revision): | |
if revision.created_by: | |
creator = created_by[compat_key.from_(revision.created_by).db] | |
return creator.nickname | |
else: | |
return "" | |
def get_creation_date(revision): | |
if revision.creation_date: | |
return revision.creation_date.date().isoformat() | |
else: | |
return "" | |
return [{ | |
"created_by": get_created_by(revision), | |
"sha": revision.sha, | |
"creation_date": get_creation_date(revision), | |
} for revision in sorted_revisions] | |
@db_util.disable_ndb_memcache | |
class AssessmentItemRevision(BaseAssessmentItem, | |
content.models.BaseRevisionNDB): | |
"""Information about a single version of an assessment item.""" | |
# Metaclass is needed to allow inheriting DB properties from | |
# BaseAssessmentItem | |
__metaclass__ = db_util.NDBInheritModelPropertiesType | |
@classmethod | |
def fixup_after_deserialize(cls, properties): | |
super(AssessmentItemRevision, cls).fixup_after_deserialize(properties) | |
# Allow item_data to be passed up as a JSON object or string | |
# TODO(joel) - potentially remove this in the future | |
if ("item_data" in properties and | |
type(properties["item_data"]) == dict): | |
properties["item_data"] = api.jsonify.jsonify( | |
properties["item_data"]) | |
# Convert string keys to actual keys | |
if "tags" in properties: | |
if properties["tags"] is not None: | |
properties["tags"] = [compat_key.from_(key).ndb for key in | |
properties["tags"]] | |
else: | |
properties["tags"] = [] | |
# Because AssessmentItems are never published, getting | |
# AssessmentItemRevisions should be made convenient. | |
@staticmethod | |
def get_by_sha(sha): | |
return content.models.BaseRevisionNDB.get_by_sha("AssessmentItem", sha) | |
@staticmethod | |
def get_by_sha_list(sha_list): | |
key_list = [ndb.Key("AssessmentItemRevision", sha) for sha in sha_list] | |
return compat_key.maybe_get_multi(key_list) |
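A small sketch of how traverse_natural_language_parts is meant to be driven (Python 2): the handler mutates part[attr_or_index] in place, which is exactly what translate_serialized_assessment_item does earlier in this gist. The item_data literal is a made-up example following the schema the traversal walks, and the import path is assumed.

import json

import assessment_items.models  # assumed import path for the models above

item_data = {
    "question": {
        "content": "What color is the sky?",
        "widgets": {
            "radio 1": {
                "type": "radio",
                "options": {"choices": [{"content": "Blue"},
                                        {"content": "Green"}]},
            },
        },
    },
    "hints": [{"content": "Look up."}],
    "answerArea": {"type": "radio", "options": {"choices": []}},
}


def shout(part, attr_or_index):
    """Handler that mutates each natural-language part in place."""
    part[attr_or_index] = part[attr_or_index].upper()

assessment_items.models.AssessmentItem.traverse_natural_language_parts(
    item_data, shout)
print json.dumps(item_data, indent=2)
# The question content, both radio choices and the hint are now upper-cased.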
"""A Compile object (see compile_rule.py): translates khan-exercises files.""" | |
import re | |
from kake import translate_javascript | |
from kake import translate_util | |
from kake.lib import compile_rule | |
class TranslateExercises(translate_util.TranslateBase): | |
"""Class for translating natural-language text in khan-exercises files.""" | |
_DATA_IF_RE = re.compile(r"data-if\s*=\s*('[^']*'|\"[^\"]*\"|[^'\">\s]+)") | |
def version(self): | |
"""Update every time build() changes in a way that affects output.""" | |
return 1 | |
def _ngettext_html(self, text, tag_info, lang, translation): | |
"""Creates 'translated' html for an ngettext-like span. | |
In exercises html, ngettext is implemented like so: | |
<span data-if="isSingular(KEY)" foo=bar>xxx</span> | |
<span data-else foo=bar>yyy</span> | |
When translating into Polish, with 4 plural forms, we want to emit | |
<span data-if="$.ngetpos(KEY, LANG) === 0" foo=bar>aaa</span> | |
<span data-else-if="$.ngetpos(KEY, LANG) === 1" foo=bar>bbb</span> | |
<span data-else-if="$.ngetpos(KEY, LANG) === 2" foo=bar>ccc</span> | |
<span data-else-if="$.ngetpos(KEY, LANG) === 3" foo=bar>ddd</span> | |
This routine returns a string consisting of the desired output. | |
""" | |
old_starttag = text[tag_info.outer_startpos:tag_info.startpos] | |
old_endtag = text[tag_info.endpos:tag_info.outer_endpos] | |
ngetpos_key = tag_info.is_singular() | |
data_if = self._DATA_IF_RE.search(old_starttag) | |
assert ngetpos_key is not None, tag_info | |
assert data_if, old_starttag | |
assert translate_util.SmallMOFile.is_plural(translation), translation | |
new_text = [] | |
for i in xrange(len(translation)): | |
if i == 0: | |
new_data_if = ('data-if="$.ngetpos(%s, \'%s\') === 0"' | |
% (ngetpos_key, lang)) | |
else: | |
new_data_if = ('data-else-if="$.ngetpos(%s, \'%s\') === %d"' | |
% (ngetpos_key, lang, i)) | |
new_text.extend((old_starttag[:data_if.start()], | |
new_data_if, | |
old_starttag[data_if.end():], | |
translation[i], | |
old_endtag)) | |
return ''.join(new_text) | |
def translate(self, infile_name, outfile_lang_moentries_context): | |
# Import here so kake users who don't actually need the | |
# translate_exercises rule can still import this module | |
# without needing to bring in exercises. | |
import exercises.babel | |
file_contents = self._read_input(infile_name) | |
# Get all the info about the scripts and nltext in this document. | |
extractor = exercises.babel.I18nExtractor() | |
extractor.feed(file_contents) | |
nltext_info = list(extractor.nltext_nodes()) | |
nltext_in_attr_info = list(extractor.nltext_attribute_values()) | |
script_info = list(extractor.javascript_nodes()) | |
script_in_attr_info = list(extractor.javascript_attribute_values()) | |
js_translator = translate_javascript.TranslateJavascript() | |
for (outfile, lang, mo_entries, _) in outfile_lang_moentries_context: | |
# Keep track of the rewrites we're supposed to do. | |
rewrites = [] # each entry is (startpos, endpos, new_text) | |
# Handle javascript in, e.g., "<script>js</script>" | |
for tag_info in script_info: | |
# Use a javascript-injector to handle the javascript | |
# inside the exercises. | |
script_contents = file_contents[tag_info.startpos: | |
tag_info.endpos] | |
babel_output = js_translator.extract_nltext(script_contents) | |
translated_js = js_translator.translate_to_lang( | |
babel_output, script_contents, lang, mo_entries) | |
# Only inject the string if there is a translation | |
if translated_js is not None: | |
rewrites.append((tag_info.startpos, tag_info.endpos, | |
translated_js)) | |
# Handle javascript in, e.g., "<span data-if="js">...</span>" | |
for (tag_info, attrval_string) in script_in_attr_info: | |
# tag_info is the tag-info for the span, attrval_string is js. | |
babel_output = js_translator.extract_nltext(attrval_string) | |
translated_js = js_translator.translate_to_lang( | |
babel_output, attrval_string, lang, mo_entries) | |
if translated_js is not None: | |
# We need to figure out where attrval_string is inside | |
# tag_info. TODO(csilvers): have attrval_pos passed in. | |
attrval_pos = file_contents.index(attrval_string, | |
tag_info.outer_startpos, | |
tag_info.startpos) | |
rewrites.append((attrval_pos, | |
attrval_pos + len(attrval_string), | |
translated_js)) | |
# Handle text. | |
last_was_singular = False # used when we're injecting ngettext | |
for tag_info in nltext_info: | |
if last_was_singular: | |
# For ngettext, we want to replace the English | |
# singular + plural spans with a bunch of | |
# translated spans (one for each plural). We | |
# already deleted the English singular and | |
# inserted the new stuff, so all that's left is to | |
# delete the English plural. | |
rewrites.append((tag_info.outer_startpos, | |
tag_info.outer_endpos, | |
'')) | |
last_was_singular = False | |
continue | |
(cleaned_contents, _) = extractor.cleaned_text(tag_info) | |
if tag_info.is_singular(): | |
# An ngettext call. We replace the English singular | |
# with a bunch of translated spans (one per plural). | |
translation = mo_entries.get_plural_translation( | |
cleaned_contents) | |
if translation: # a translation was found | |
new_text = self._ngettext_html(file_contents, tag_info, | |
lang, translation) | |
rewrites.append((tag_info.outer_startpos, | |
tag_info.outer_endpos, | |
new_text)) | |
last_was_singular = True | |
else: | |
# A gettext call. We can replace the text between | |
# the tags with translated text. | |
translation = mo_entries.get_singular_translation( | |
cleaned_contents) | |
if translation: | |
rewrites.append((tag_info.startpos, tag_info.endpos, | |
translation)) | |
# Handle text in, e.g., "<input value="text">" | |
for (tag_info, attrval_string) in nltext_in_attr_info: | |
# tag_info is the tag-info for the input, attrval is text. | |
translation = mo_entries.get_singular_translation( | |
attrval_string) | |
if translation: | |
# We need to figure out where attrval_string is inside | |
# tag_info. TODO(csilvers): have attrval_pos passed in. | |
attrval_pos = file_contents.index(attrval_string, | |
tag_info.outer_startpos, | |
tag_info.startpos) | |
rewrites.append((attrval_pos, | |
attrval_pos + len(attrval_string), | |
translation)) | |
# Small optimization to catch a case where we know the output is unchanged. | |
if not rewrites: | |
translated_contents = None | |
else: | |
# Construct the new file from the original file + rewrites. | |
translated_contents = [] | |
rewrites.sort() # get in startpos order | |
i = 0 | |
for (startpos, endpos, new_text) in rewrites: | |
translated_contents.append(file_contents[i:startpos]) | |
translated_contents.append(new_text) | |
i = endpos | |
translated_contents.append(file_contents[i:]) | |
translated_contents = ''.join(translated_contents) | |
if translated_contents == file_contents: | |
translated_contents = None # signal that output == input | |
self._write_output(infile_name, outfile, translated_contents) | |
compile_rule.register_compile( | |
'TRANSLATED KA-EXERCISE', | |
'genfiles/translations/{lang}/khan-exercises/{{path}}.html', | |
['khan-exercises/{{path}}.html', | |
('genfiles/extracted_strings/{lang}/' | |
'khan-exercises/{{path}}.html.small_mo.pickle')], | |
TranslateExercises(), | |
# small_mo.pickle files are recreated every time {lang}.po files | |
# change, but their contents usually don't change, so crc's are | |
# good for us. | |
compute_crc=True) |
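The translate() method above accumulates (startpos, endpos, new_text) rewrites against the original file and then stitches the output together in one pass. The splicing step in isolation looks like this (standalone Python 2 sketch; the toy string and offsets are made up):

def apply_rewrites(text, rewrites):
    """Apply non-overlapping (startpos, endpos, new_text) rewrites to text."""
    out = []
    i = 0
    for (startpos, endpos, new_text) in sorted(rewrites):
        out.append(text[i:startpos])   # unchanged text before this rewrite
        out.append(new_text)           # the replacement
        i = endpos                     # resume after the replaced span
    out.append(text[i:])               # trailing unchanged text
    return ''.join(out)


html = '<p>one</p><p>two</p>'
rewrites = [(13, 16, 'dos'), (3, 6, 'uno')]   # offsets of "two" and "one"
print apply_rewrites(html, rewrites)          # <p>uno</p><p>dos</p>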
"""A Compile object (see compile_rule.py): translates .js files.""" | |
import cStringIO | |
import json | |
import re | |
import third_party.babel.messages.extract | |
import intl.data | |
import intl.english_only | |
from kake import translate_util | |
from kake.lib import compile_rule | |
class TranslateJavascript(translate_util.TranslateBase): | |
"""Class for translating natural-language text in javascript files.""" | |
_JS_GETTEXT_RE = re.compile(r'\$\._|\$\.ngettext|\$_') | |
_BABEL_KW = third_party.babel.messages.extract.DEFAULT_KEYWORDS.copy() | |
# <$_> in jsx expands to $_({varmap}, "string", ...), so kw-index is 2. | |
_BABEL_KW['$_'] = (2,) # used in .jsx files as <$_> | |
def version(self): | |
"""Update every time build() changes in a way that affects output.""" | |
return 1 | |
def extract_nltext(self, file_contents): | |
# Extract the messages from the JavaScript file with the | |
# appropriate start and end positions as well. As a small | |
# efficiency short-cut, we can say that if none of ($._, | |
# $.ngettext, $_) are present in file_contents, there won't be | |
# any translations to do. | |
if not self._JS_GETTEXT_RE.search(file_contents): | |
return [] | |
else: | |
# Convert it to a StringIO buffer for pybabel to handle. | |
# pybabel expects utf-8 encoded input, so that's what we give it. | |
input = cStringIO.StringIO(file_contents.encode('utf-8')) | |
r = third_party.babel.messages.extract.extract_javascript( | |
input, self._BABEL_KW, [], {'messages_only': True}) | |
return list(r) # convert from iterator if need be | |
def translate_to_lang(self, babel_output, file_contents, lang, mo_entries): | |
# Keep track of whether the translated file differs from the original. | |
has_diff = False | |
# Go through all of the matched messages in reverse (to avoid | |
# having to deal with the changes in position of the messages). | |
for (messages, start, end) in reversed(babel_output): | |
# Figure out the lookup key. extract_javascript returns a | |
# list if a plural is found. | |
key = messages | |
if isinstance(key, basestring): # singular | |
key = messages | |
translation = mo_entries.get_singular_translation(key) | |
if not translation: | |
continue | |
insert_text = json.dumps(translation) | |
elif messages[0] is None: # jsx-style $_() | |
# For the jsx $_() operator, the gettext string is the | |
# *second* argument (the first argument is the value-dict). | |
key = messages[1] | |
translation = mo_entries.get_singular_translation(key) | |
if not translation: | |
continue | |
insert_text = json.dumps(translation) | |
else: # plural | |
key = messages[0] | |
translation_dict = mo_entries.get_plural_translation(key) | |
if not translation_dict: | |
continue | |
# We need the messages to be sorted by index. | |
index_and_messages = sorted(translation_dict.iteritems()) | |
messages = [message for (_, message) in index_and_messages] | |
# We store the info we need in legal javascript format. | |
insert_text = json.dumps({"lang": lang, "messages": messages}) | |
# Insert the string at the right position. | |
file_contents = ''.join((file_contents[:start], | |
insert_text, | |
file_contents[end:])) | |
# A change was made to the file. | |
has_diff = True | |
if has_diff: | |
return file_contents | |
else: | |
return None # signals that output == input | |
def translate(self, infile_name, outfile_lang_moentries_context): | |
if intl.english_only.should_not_translate(infile_name): | |
# If we shouldn't translate it, we can just symlink it! | |
for (outfile, _, _, _) in outfile_lang_moentries_context: | |
self._write_output(infile_name, outfile, None) | |
return | |
file_contents = self._read_input(infile_name) | |
babel_output = self.extract_nltext(file_contents) | |
for (outfile, lang, mo_entries, _) in outfile_lang_moentries_context: | |
# Get the translation, or None if output == input. | |
translated_contents = self.translate_to_lang( | |
babel_output, file_contents, lang, mo_entries) | |
self._write_output(infile_name, outfile, translated_contents) | |
# These rules are only used in dev (where we don't compress js), and | |
# for js worker files, which we likewise translate without compressing. | |
compile_rule.register_compile( | |
'TRANSLATED RAW JS FILES', | |
'genfiles/translations/{lang}/{{path}}.js', | |
['{{path}}.js', | |
'genfiles/extracted_strings/{lang}/{{path}}.js.small_mo.pickle'], | |
TranslateJavascript(), | |
# small_mo.pickle files are recreated every time {lang}.po files | |
# change, but their contents usually don't change, so crc's are | |
# good for us. | |
compute_crc=True) | |
# This catches files that are compiled (or transpiled) into js. | |
dirs_with_js = ('genfiles/compiled_{type}', | |
# This is a special case (bundle.js has its own directory) | |
'genfiles/khan-exercises', | |
) | |
for d in dirs_with_js: | |
translate_util.register_translatesafe_compile( | |
'TRANSLATED JS FILES (%s)' % d, | |
'%s/{lang}/{{path}}.js' % d, | |
['%s/en/{{path}}.js' % d, | |
('genfiles/extracted_strings/{lang}/' | |
'%s/{lang}/{{path}}.js.small_mo.pickle' % d)], | |
TranslateJavascript(), | |
compute_crc=True) | |
# This is the rule used in prod, where we only translate javascript | |
# after it's been compressed. The exception is handlebars files, | |
# which are translated before they're even converted to javascript, in | |
# compile_handlebars.py. Luckily, the special-case rule for the | |
# handlebars files (in compress_js.py) has higher precedence than this | |
# rule, so we can be fully general here. | |
# We use 'trumped_by' to make sure this rule doesn't apply when lang=en, | |
# and also to make sure this rule doesn't apply when translating handlebars. | |
translate_util.register_translatesafe_compile( | |
'TRANSLATED COMPRESSED JS FILES', | |
'genfiles/compressed_javascript/{lang}/{{path}}.min.js', | |
['genfiles/compressed_javascript/en/{{path}}.min.js', | |
('genfiles/extracted_strings/{lang}/genfiles/compressed_javascript/{lang}' | |
'/{{path}}.min.js.small_mo.pickle')], | |
TranslateJavascript(), | |
trumped_by=['COMPRESSED JS', 'COMPRESSED TRANSLATED HANDLEBARS JS'], | |
compute_crc=True, | |
) |
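translate_to_lang above iterates babel's matches in reverse so that earlier (start, end) offsets stay valid while later spans are replaced. The same idea in isolation (Python 2; the offsets are hand-computed stand-ins for real babel output):

import json

js = 'alert($._("Hello")); alert($._("Bye"));'

# (message, start, end) tuples covering the string literals, in the order
# babel would report them; the offsets here are hand-computed.
matches = [('Hello', 10, 17), ('Bye', 31, 36)]
translations = {'Hello': u'Hola', 'Bye': u'Adios'}

for (message, start, end) in reversed(matches):
    translation = translations.get(message)
    if not translation:
        continue
    # Replacing from the end backwards leaves earlier offsets untouched.
    js = js[:start] + json.dumps(translation) + js[end:]

print js   # alert($._("Hola")); alert($._("Adios"));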