Skip to content

Instantly share code, notes, and snippets.

@aronasorman
Created March 30, 2015 18:53
Show Gist options
  • Save aronasorman/e8f77ad7444d55c5a5e1 to your computer and use it in GitHub Desktop.
Save aronasorman/e8f77ad7444d55c5a5e1 to your computer and use it in GitHub Desktop.
perseus i18n files from KA
#!/usr/bin/env python
"""Extracts translatable strings from HTML exercise files.
This program is used for extracting translatable strings from
exercise files and optionally outputting a PO or JSON file to be used
for further translation.
How it works: The script goes through the HTML files and attempts
to locate all nodes that have text as a direct child node. It then
extracts the HTML contents of that node and returns it as a translatable
string. Note that certain nodes are excluded from this list as they
contain non-translatable text (see _IGNORE_TAGS and _IGNORE_ATTRVALS).
It is assumed that everything that isn't excluded by _IGNORE_TAGS or
_IGNORE_ATTRVALS is something that needs to be translated.
We also call out to babel to extract javascript inside exercises (in
<script> tags, but also in some other places where javascript is
allowed in exercise html files).
"""
import cStringIO
import HTMLParser
import argparse
import json
import os.path
import re
import sys
# Make sure we can import third_party even when run from the commandline.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import third_party.babel.messages.extract
from third_party import i18nize_templates
from third_party import polib
# Tag names whose inner text is javascript.  The javascript extractor
# is run over the contents of these tags, to pull the natural language
# text out of the script code.
_JAVASCRIPT_TAGS = frozenset(['script', 'var'])

# (attribute-name, text) pairs that also mark the innerhtml of a tag
# as javascript.  A tag qualifies when it has the attribute and the
# attribute's value *CONTAINS* the text, so
# '<span class="foo-guess-bar">' matches ('class', 'guess').
_JAVASCRIPT_ATTRVALS = frozenset([
    ('class', 'graphie'),
    ('class', 'guess'),
    ('class', 'validator-function'),
])

# Just the attribute names used in _JAVASCRIPT_ATTRVALS.
# TODO(csilvers): Convert to a more efficient data structure.
_JAVASCRIPT_ATTRVAL_ATTRS = frozenset(
    pair[0] for pair in _JAVASCRIPT_ATTRVALS)

# Attributes whose *value* is itself javascript (as opposed to
# _JAVASCRIPT_ATTRVALS, where the attribute only signals that the
# tag's contents are javascript).  For instance
# <span data-if="<some javascript>">.
_JAVASCRIPT_ATTRS = frozenset([
    'data-choices',
    'data-else',
    'data-ensure',
    'data-if',
    'data-if-else',
])

# Tag names we never extract strings from.
_IGNORE_TAGS = frozenset(['code', 'style'])

# (attribute-name, text) pairs that make us ignore a tag.  A tag is
# ignored when it has the attribute and the attribute's value
# *CONTAINS* the text, so '<span data-type="a-regexp">' matches
# ('data-type', 'regex').
_IGNORE_ATTRVALS = frozenset([
    ('data-type', 'regex'),
])

# Just the attribute names used in _IGNORE_ATTRVALS.
# TODO(csilvers): Convert to a more efficient data structure.
_IGNORE_ATTRVAL_ATTRS = frozenset(
    pair[0] for pair in _IGNORE_ATTRVALS)

# URL template for referencing an exercise; %s is the exercise name.
_EXERCISE_URL = 'http://www.khanacademy.org/exercise/%s'
class UnmatchedEndTagError(Exception):
    """Raised when an end-tag has no corresponding start-tag.

    Attributes:
        tagname: the name of the offending end-tag.
        linenum: the line the end-tag was found on.
        colnum: the column the end-tag was found at.
    """

    def __init__(self, tagname, linenum, colnum):
        super(UnmatchedEndTagError, self).__init__()
        self.tagname = tagname
        self.linenum = linenum
        self.colnum = colnum

    def __str__(self):
        # Not super-useful since we don't know the filename!
        where = (self.tagname, self.linenum, self.colnum)
        return ('No start tag found for end-tag </%s> at line %s, col %s'
                % where)
def main():
"""Handle running this program from the command-line."""
# Handle parsing the program arguments
arg_parser = argparse.ArgumentParser(
description='Extract translatable strings from HTML exercise files.')
arg_parser.add_argument('html_files', nargs='+',
help='The HTML exercise files to extract strings from.')
arg_parser.add_argument('--output', dest='output',
help='The file to write the output to.')
arg_parser.add_argument('--format', choices=['po', 'json'],
dest='format', default='po',
help='The format of the output. (default: %(default)s)')
arg_parser.add_argument('--quiet', action='store_true',
help='Do not emit status to stderr on successful runs.')
args = arg_parser.parse_args()
if args.format == 'po':
# Output a PO file by default
results = unicode(make_potfile(args.html_files,
not args.quiet)).encode('utf-8')
else:
# Optionally output a JSON-encoded data structure
results = json.dumps(extract_files(args.html_files, not args.quiet),
cls=_SetEncoder, indent=2)
if args.output:
# If an output location is specified, write the output to that file
output_file = open(args.output, 'w')
output_file.write(results)
output_file.close()
else:
# Otherwise just write the output to STDOUT
print results
def make_potfile(files, verbose):
    """Build a PO file out of the strings found in some HTML files.

    Arguments:
        files: the HTML exercise files to extract strings from.
        verbose: passed along to extract_files().

    Returns:
        A polib.POFile with one entry per extracted string.  Call
        unicode() on it to get the text of the PO file.
    """
    # Turn off line-wrapping: it can mess with html markup inside PO comments.
    potfile = polib.POFile(wrapwidth=sys.maxint, encoding='utf-8')

    for (nl_text, comments, occurrences) in extract_files(files, verbose):
        if isinstance(nl_text, basestring):
            # A plain string is a normal (gettext) entry.
            text_fields = {
                'msgid': unicode(nl_text),
                'msgid_plural': None,
                'msgstr': "",
                'msgstr_plural': None,
            }
        else:
            # A (singular, plural) tuple is a plural (ngettext) entry.
            text_fields = {
                'msgid': unicode(nl_text[0]),
                'msgid_plural': unicode(nl_text[1]),
                'msgstr': None,
                'msgstr_plural': {0: u"", 1: u""},
            }

        potfile.append(polib.POEntry(
            comment='\n'.join(comments),
            occurrences=occurrences,
            **text_fields))

    return potfile
def extract_files(files, verbose):
    """Extract the translatable strings from a set of HTML files.

    Arguments:
        files: the HTML exercise files to extract strings from.
        verbose: if true, report progress on stderr.

    Returns:
        A list of natural language texts and their occurrences:
           [(nl-text, [comment, ...],
             [(1st-file, 1st-linenum), (2nd-file, 2nd-linenum), ...]),
            ...
           ]
        nl-text is a string, or a (singular, plural) pair for
        ngettext-style entries.  For each nl-text, the (file, linenum)
        pairs are sorted in lexicographic order (first by filename,
        then by line-number).  The list of natural-language texts is
        sorted by the (1st-file, 1st-linenum), to maximize the chances
        texts from the same file will sort together.
    """
    matches = {}
    comments = {}

    # Go through all the exercise files.
    for filename in files:
        if verbose:
            print >>sys.stderr, 'Extracting strings from: %s' % filename
        extract_file(filename, matches, comments)

    if verbose:
        total = len(matches)
        suffix = "" if total == 1 else "s"
        print >>sys.stderr, '%s string%s extracted.' % (total, suffix)

    # Get the matches into the return format.
    retval = []
    for nl_text in matches:
        occurrences = sorted(matches[nl_text])
        # The first file the text occurs in gives us the exercise to
        # point translators at.
        url = _filename_to_url(occurrences[0][0])
        msg_comments = (list(comments.get(nl_text, []))
                        + [unicode('Text is in <%s>' % url)])
        retval.append((nl_text, msg_comments, occurrences))

    # Now sort the nl-texts so they come in order of their first occurrence.
    # We break ties -- should be rare -- arbitrarily.
    retval.sort(key=lambda entry: (entry[2][0], entry[0]))
    return retval
class I18nExtractor(HTMLParser.HTMLParser):
    """Lexes html, and returns interesting tags via nltext_nodes().

    'Interesting' tags are those that a) have text directly in them
    (that is, not inside some nested tag), b) are not in _IGNORE_TAGS
    or _IGNORE_ATTRVALS, and c) are not nested inside another
    'interesting' tag.  The text for such a tag (see cleaned_text())
    is the full html of that tag: that is, everything between <tag>
    and </endtag> (non-inclusive), with whitespace collapsed.

    The parser also collects the tags whose contents are javascript
    (javascript_nodes()), the attribute values that are javascript
    (javascript_attribute_values()), and the attribute values that
    are natural language text (nltext_attribute_values()).

    This assumes well-formed html!  It will do weird things otherwise.

    NOTE: This class is used by webapp:kake/translate_exercises.py,
    so be careful about code there if you refactor here.
    """

    class TagInfo(object):
        """Everything we track about one tag seen in the html."""

        def __init__(self, tagname, attrs,
                     startline, startpos, outer_startpos, parent_tag_info):
            """Record a newly opened tag.

            Arguments:
                tagname: the name of the tag.
                attrs: the tag's attributes, as a list of
                    (name, value) pairs.
                startline: the line number the tag starts on.
                startpos: offset into the text of the start of the
                    tag's contents (just after the start-tag).
                outer_startpos: offset into the text of the start-tag
                    itself.
                parent_tag_info: the TagInfo of the enclosing tag, or
                    None if there is none.
            """
            self.tagname = tagname
            self.attrs = attrs
            self.should_emit_tag = True             # start optimistic
            self.tag_has_non_whitespace = False     # start pessimistic
            self.startline = startline              # line # into text
            self.startpos = startpos                # offset into text
            self.endpos = None                      # will point after text
            self.outer_startpos = outer_startpos    # offset into tag
            self.outer_endpos = None     # will point after close-tag

            # We know right away we shouldn't emit a tag if any of the
            # following are true: we shouldn't emit the tag's parent,
            # the tag is in _IGNORE_TAGS, or the tag has some
            # attribute values that are in _IGNORE_ATTRVALS.
            if self.should_emit_tag:
                self.should_emit_tag = (not parent_tag_info or
                                        parent_tag_info.should_emit_tag)
            if self.should_emit_tag:
                self.should_emit_tag = not self._tag_matches(
                    _IGNORE_TAGS, _IGNORE_ATTRVAL_ATTRS, _IGNORE_ATTRVALS)

        _IS_SINGULAR_RE = re.compile(r'^isSingular\((.*)\)$')

        def _tag_matches(self, tagnames, attrs, attrvals):
            """True if our tag is in tagnames or an attr/val is in attrvals.

            Arguments:
                tagnames: tag names that match outright.
                attrs: the attribute names that occur in attrvals
                    (used as a quick pre-filter).
                attrvals: (attribute-name, text) pairs; we match if we
                    have that attribute and its value contains text.
            """
            if self.tagname in tagnames:
                return True
            for attrval in self.attrs:
                # The parser reports an attribute without a value
                # (<div class>) with a value of None.  Such an
                # attribute can't contain any text, so it never
                # matches (and must not reach the 'in' test below).
                if attrval[1] is None:
                    continue
                if attrval[0] in attrs:
                    for (match_attr, match_tag) in attrvals:
                        if (match_attr == attrval[0] and
                                match_tag in attrval[1]):    # *CONTAINS*
                            return True
            return False

        def is_javascript_tag(self):
            """True if this tag contains javascript in its innerhtml."""
            return self._tag_matches(_JAVASCRIPT_TAGS,
                                     _JAVASCRIPT_ATTRVAL_ATTRS,
                                     _JAVASCRIPT_ATTRVALS)

        def nltext_attrvals(self):
            """Return attribute values for value/etc attributes in tag."""
            # We re-use the code in i18nize_templates which already
            # finds natural language text in tag attributes.
            return [self.attrs[i][1]
                    for i in i18nize_templates.natural_language_attributes(
                        self.tagname, self.attrs)]

        def javascript_attrvals(self):
            """Return attribute values for data-if/etc attributes in tag."""
            return [val for (attr, val) in self.attrs
                    if attr in _JAVASCRIPT_ATTRS]

        def is_singular(self):
            """foo if this tag has an data-if="isSingular(foo)", else None."""
            for (attr, val) in self.attrs:
                # val is None for a valueless attribute (<span data-if>),
                # which can't hold an isSingular() call.
                if attr == 'data-if' and val is not None:
                    m = self._IS_SINGULAR_RE.match(val)
                    if m:
                        return m.group(1)
            return None

    def __init__(self, *args, **kwargs):
        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        self.tagstack = []     # stack (list) of TagInfo's.
        self.candidates = []   # tags that we provisionally should emit
        self.script_nodes = []     # TagInfos for js content in this html
        self.script_attrvals = []  # (TagInfo, attr-val) where attrval is js
        self.nltext_attrvals = []  # (TagInfo, attr-val) where attrval is text

    def _line_offset_to_pos(self, line_and_offset):
        """Input is a tuple (5, 10): 10th char of the 5th line.

        Returns the corresponding offset into the text given to feed().
        """
        return self.linepos[line_and_offset[0]] + line_and_offset[1]

    # Older versions of HTMLParser (2.7.1, at least) have a bug where
    # they end <script> tags on *any* </tag>, not just </script>.  Fix
    # that.  Later versions of Python fix the bug but add a new arg
    # to set_cdata_mode(), so we have to handle that too.
    def set_cdata_mode(self, *args):
        self.interesting = re.compile(r'</%s' % re.escape(self.lasttag))

    # Sadly, there's no regexp pattern that just matches unicode
    # alphabetic characters, so we do the inverse: non-alnums + digits
    # = non-alphabetic.
    _NO_ALPHA_RE = re.compile(r'^[\W\d]+$', re.UNICODE)

    def handle_starttag(self, tag, attrs):
        """Callback for a start-tag: push a TagInfo for it on the stack."""
        # Read past the start-tag; startpos is the start of the 'inner' html.
        startline = self.getpos()[0]
        outer_startpos = self._line_offset_to_pos(self.getpos())
        startpos = outer_startpos + len(self.get_starttag_text())

        taginfo = I18nExtractor.TagInfo(
            tag, attrs, startline, startpos, outer_startpos,
            self.tagstack[-1] if self.tagstack else None)

        # Sometimes tags have attributes whose values are natural
        # language text (e.g. <input value="some text">).  If so,
        # store that fact.
        for nltext_val in taginfo.nltext_attrvals():
            if nltext_val and not self._NO_ALPHA_RE.match(nltext_val):
                self.nltext_attrvals.append((taginfo, nltext_val))

        # Sometimes tags have attributes whose values are javascript
        # (e.g. <span data-if="some_javascript_code">).  If so, store
        # that fact.
        for js_val in taginfo.javascript_attrvals():
            if js_val:
                self.script_attrvals.append((taginfo, js_val))

        self.tagstack.append(taginfo)

    def handle_endtag(self, tag):
        """Callback for an end-tag: close the matching tag on the stack.

        Raises:
            UnmatchedEndTagError: if no open tag has this name.
        """
        # We need the while because not all tags have end-tags (e.g. <meta>)
        while self.tagstack and self.tagstack[-1].tagname != tag:
            self.tagstack.pop()

        # This can fail if the html is not well-formed (no balanced tags)
        if not self.tagstack:
            (linenum, colnum) = self.getpos()
            raise UnmatchedEndTagError(tag, linenum, colnum)

        tag_info = self.tagstack.pop()

        # Update endpos
        tag_info.endpos = self._line_offset_to_pos(self.getpos())
        # outer_endpos points after the end of this tag.  HTMLParser
        # doesn't expose this info, so we depend on the fact end-tags
        # can't have tag-attrs, so searching for '>' is good enough.
        tag_info.outer_endpos = self.text.index('>', tag_info.endpos) + 1

        # If the tag's innerhtml is javascript, add it to our list of
        # javascript nodes.  Else if tagname-is-good is set, and the
        # tag contains non-ws text, then its contents are a candidate
        # to be extracted.  However, its parents get dibs: we don't
        # extract this if we extract a parent.  We will have to wait
        # until the parent is done, to see.
        if tag_info.is_javascript_tag():
            self.script_nodes.append(tag_info)
        elif tag_info.should_emit_tag and tag_info.tag_has_non_whitespace:
            self.candidates.append(tag_info)

    def handle_data(self, data):
        """Callback for text between tags."""
        if data.strip():          # not just whitespace
            assert self.tagstack
            self.tagstack[-1].tag_has_non_whitespace = True

    def handle_charref(self, charref):
        """Callback for numeric character references, e.g. &#39;."""
        # NOTE(review): named entity references (e.g. &apos;) are
        # reported through handle_entityref(), which this class does
        # not override, so they do not mark a tag as having text.
        # Confirm that is intended.
        assert self.tagstack
        self.tagstack[-1].tag_has_non_whitespace = True

    def feed(self, text):
        """Store the text so we can print from it, and make line->pos table.

        The stored text and the table are replaced on every call, so
        the whole document has to be passed in a single call.
        """
        self.text = text
        self.linepos = [None, 0]    # dummy 0-th line; linenums start at 1
        while True:
            newline = text.find('\n', self.linepos[-1])
            if newline == -1:
                break
            self.linepos.append(newline + 1)
        HTMLParser.HTMLParser.feed(self, text)

    def cleaned_text(self, tag_info):
        """Return text between <tag> and </tag>, cleans up whitespaces.

        Get rid of leading and trailing whitespace, and collapse runs of
        whitespace to a single whitespace (changing newline to space).

        Returns:
            The cleaned text, and the line-number that the cleaned
            text starts on.
        """
        text = self.text[tag_info.startpos:tag_info.endpos]

        # Figure out the line number of the first non-whitespace char.
        line_number = tag_info.startline
        leading_whitespace = text[:len(text) - len(text.lstrip())]
        line_number += leading_whitespace.count('\n')

        # Normalize whitespace and return.
        return (re.sub(r'\s+', ' ', text).strip(), line_number)

    def nltext_nodes(self):
        """Yields TagInfo objects representing nodes with nl-text in them."""
        # If one candidate is inside another one, we print the outside
        # one.  We can figure this out via sorting.
        self.candidates.sort(key=lambda tag_info: tag_info.startpos)
        if self.candidates:
            yield self.candidates[0]
            parent_range = (self.candidates[0].startpos,
                            self.candidates[0].endpos)
            for i in xrange(1, len(self.candidates)):
                # If we're entirely inside our parent, ignore us.
                if (self.candidates[i].startpos >= parent_range[0] and
                        self.candidates[i].endpos <= parent_range[1]):
                    continue
                yield self.candidates[i]
                parent_range = (self.candidates[i].startpos,
                                self.candidates[i].endpos)

    def javascript_nodes(self):
        """Yields TagInfo objects representing nodes with js-text in them."""
        for taginfo in self.script_nodes:
            yield taginfo

    def javascript_attribute_values(self):
        """Yields (TagInfo, attribute_value_string) when attr-value is js."""
        for (taginfo, attrval_string) in self.script_attrvals:
            yield (taginfo, attrval_string)

    def nltext_attribute_values(self):
        """Yields (TagInfo, attribute_value_string) when attr-value is text."""
        for (taginfo, attrval_string) in self.nltext_attrvals:
            yield (taginfo, attrval_string)
# Matches the start of the only two calls that mark up text for
# translation in javascript: $._ and $.ngettext.
_JS_GETTEXT_RE = re.compile(r'\$\._|\$\.ngettext')


def javascript_has_no_i18n_markup(js_text):
    """Return true if neither $._ nor $.ngettext occurs in js_text.

    This is an optimization -- the only strings we ever extract from
    javascript are the ones inside $._() and $.ngettext(), so when a
    quick search for those names fails we can skip the more expensive
    javascript tokenization entirely.

    Note that this can return False even if there's no actual $._
    call in the code (for instance, if '$._' appears in a comment).
    That's ok though; it just means we don't benefit from the
    optimization for this file.
    """
    return _JS_GETTEXT_RE.search(js_text) is None
def _extract_javascript(filename, js_text, js_start_line, matches, comments):
    """Add the translatable strings found in some javascript to matches.

    Arguments:
        filename: the html file the javascript comes from.
        js_text: the javascript source.
        js_start_line: where this javascript starts inside the html file.
        matches: a dict from found nl-strings to a set of
            (filename, linenumber) pairs; updated in place.
        comments: a dict from found nl-strings to a list of comments
            about those strings; updated in place.
    """
    # All i18n markup in javascript is via $._ and $.ngettext, so if
    # those aren't present, we can bail, confident in the lack of nltext.
    if javascript_has_no_i18n_markup(js_text):
        return

    babel_extract_module = third_party.babel.messages.extract
    found = babel_extract_module.extract_javascript(
        cStringIO.StringIO(js_text.encode('utf-8')),
        babel_extract_module.DEFAULT_KEYWORDS,
        ['I18N:'],
        {})

    for (lineno, _funcname, message, msg_comments) in found:
        # the javascript extractor has a 'feature' where it appends a
        # third None argument if an ngettext message has interpolated
        # strings ("%(foo)s").  We ignore that.
        if isinstance(message, tuple):
            message = message[:2]
        # lineno counts from 1 within the javascript, so shift it to
        # be relative to the html file.
        occurrence = (filename, js_start_line - 1 + lineno)
        matches.setdefault(message, set()).add(occurrence)
        comments.setdefault(message, []).extend(msg_comments)
def extract_file(filename, matches, comments, contents=None):
    """Extract the translatable strings from a single HTML file.

    Nothing is returned: matches and comments are updated in place
    with whatever is found.

    Arguments:
        filename: the .html file to extract natural language text from.
        matches: a dict from found nl-strings to a set of
           (filename, linenumber) pairs where this string is found.
        comments: a dict from found nl-strings to a list of
           comments about those strings (extracted from the source code).
        contents: if not None, the contents of file 'filename'.
    """
    if contents is None:
        with open(filename) as f:
            contents = f.read().decode('utf-8')

    extractor = I18nExtractor()
    extractor.feed(contents)

    def record(nl_text, occurrence):
        matches.setdefault(nl_text, set()).add(occurrence)
        # TODO(csilvers): extract comments from source and append here.
        comments.setdefault(nl_text, [])

    # When we see the singular half of an ngettext pair we hold on to
    # it here, until the next tag supplies the plural half.
    pending_singular = None
    pending_occurrence = None
    for tag_info in extractor.nltext_nodes():
        (text, linenum) = extractor.cleaned_text(tag_info)
        if pending_singular is not None:
            # The last tag was the singular part of an ngettext call,
            # so we're the plural.
            record((pending_singular, text), pending_occurrence)
            pending_singular = None
            pending_occurrence = None
        elif tag_info.is_singular():
            # *We're* the singular part of an ngettext call; the next
            # tag will add us to matches.
            pending_singular = text
            pending_occurrence = (filename, linenum)
        else:
            # Normal, gettext call.
            record(text, (filename, linenum))

    # We also need to worry about natural language text inside tags,
    # such as <input value="natural language text">.
    for (tag_info, nl_text) in extractor.nltext_attribute_values():
        # tag_info.startline may not be exactly right, since the
        # attribute may not be on the same line as the start of the
        # tag, but it's hopefully close enough to be useful.
        record(nl_text, (filename, tag_info.startline))

    # We also need to extract nl-strings from javascript -- in two
    # places, text in tags (<script>js</script>) and text in tag
    # attributes (<span data-if="js">).  We call on babel to help with
    # both.
    for tag_info in extractor.javascript_nodes():
        # Babel will report line-numbers starting from 1, but really
        # they should be starting from tag_info.startline.
        _extract_javascript(
            filename,
            extractor.text[tag_info.startpos:tag_info.endpos],
            tag_info.startline,
            matches, comments)

    for (tag_info, js_text) in extractor.javascript_attribute_values():
        # As above, tag_info.startline is only approximately the line
        # of the attribute.
        _extract_javascript(filename, js_text, tag_info.startline,
                            matches, comments)
def _filename_to_url(filename):
    """Convert an exercise filename into a khan academy url.

    The exercise name is the filename with its directory and its
    extension stripped off.
    """
    (exercise_name, _extension) = os.path.splitext(
        os.path.basename(filename))
    return _EXERCISE_URL % exercise_name
class _SetEncoder(json.JSONEncoder):
    """JSON encoder that writes sets out as lists.

    From: http://stackoverflow.com/a/8230505/6524
    """

    def default(self, obj):
        """Return a list for a set; defer to JSONEncoder otherwise."""
        if not isinstance(obj, set):
            return json.JSONEncoder.default(self, obj)
        return list(obj)
def babel_extract(fileobj, keywords, comment_tags, options):
    """Babel extraction method for exercises templates.

    Arguments:
        fileobj: the file-like object the messages should be extracted from,
            in this case a single exercise file.  Only its name is used.
        keywords: a list of keywords (i.e. function names) that should be
            recognized as translation functions.  Ignored.
        comment_tags: a list of translator tags to search for and include
            in the results.  Ignored.
        options: a dictionary of additional options (optional).  Ignored.

    Returns:
        An iterator over (lineno, funcname, message, comments) tuples.
        funcname is '_' for a plain string and 'ngettext' for a
        (singular, plural) message.
    """
    extracted = extract_files([fileobj.name], verbose=False)
    for (nl_text, comments, occurrences) in extracted:
        funcname = '_' if isinstance(nl_text, basestring) else 'ngettext'
        # Emit each distinct line number once.
        for line_number in set(occ[1] for occ in occurrences):
            yield (line_number, funcname, nl_text, comments)
# Run the command-line interface only when executed as a script, not
# when this module is imported.
if __name__ == '__main__':
    main()
"""Holds tools needed to translate assessment items."""
import copy
import json
import re
import api.jsonify
import assessment_items.models
from intl import i18n
import intl.regexps
import intl.request
# A number inside perseus text, with an optional thousands group
# separator and decimal point, e.g. 9,100.3.  Inside $...$ the commas
# should be wrapped in curly braces, e.g. $9{,}100.3$, so those are
# accepted too.
_PERSEUS_NUMBER_RE = re.compile(
    r'\d+((\{?,\}?)\d{3})*\.?\d*', re.UNICODE)

# The characters (curly braces and commas) that have to be removed
# from a matched perseus number before it can be cast to a float.
_BAD_FLOAT_CHARS_RE = re.compile(r'[\{\},]', re.UNICODE)

# To tell whether we are inside or outside of $...$ we need to count
# the non-escaped dollar signs that come before us.  "I owe \$5,000"
# does not match anything, since the dollar sign is escaped.
# Theoretically the backslash itself could also be escaped, as in
# "\\$5{,}000$", in which case the dollar sign is for real.  For
# completeness we check for that too.
_NON_ESCAPED_DOLLAR_SIGN_RE = re.compile(
    r"(?<!\\)(?:\\\\)*\$", re.UNICODE)
def _translate_number(match):
    """Translate a number in perseus text to the request language's format.

    Different languages use different decimal and group separators:
    1,234.56 in English is 1.234,56 in Spanish.  If the number appears
    within TeX ($...$) then curly braces are added around each comma.

    The precision of the format is set to the number of decimal places
    shown in the English version, so that 5.00 becomes 5,00 in Spanish
    (but see the rounding TODO below).

    Arguments:
        match: a match object for _PERSEUS_NUMBER_RE.

    Returns:
        The text to substitute for the matched number.
    """
    # TODO(james): Merge this with a similar function in download_i18n.py
    english_number = match.group(0)

    # Get the default number pattern for the current language
    decimal_format = i18n.request_language_decimal_format()
    if not decimal_format:
        # Fake languages (ie. boxes & accents) won't have a decimal format.
        # So we leave the number untranslated
        # TODO(james): consider translating these on the fly as well
        return english_number

    # Curly braces and commas have to go before the text can be cast
    # to a float.
    number_string = _BAD_FLOAT_CHARS_RE.sub("", english_number)

    # The precision of the fractional part is the number of characters
    # after the decimal point, ie. for 1.240 -> 3 (and 0 when there is
    # no decimal point).
    frac_prec = len(number_string.partition(".")[2])

    # Set the format's (min, max) precision to that of the English
    # version, so the translation shows the same precision.  The
    # format is global state, so we change a copy of it.
    decimal_format = copy.copy(decimal_format)
    decimal_format.frac_prec = (frac_prec, frac_prec)

    # TODO(james): format_decimal rounds numbers hence "$5.00" -> "$5"
    # Figure out how to not get it to round, or to add significant figures
    # back in afterwards
    translated_number = unicode(
        i18n.format_decimal(float(number_string), decimal_format))

    # We want {} around the commas if the number is inside $..$ but
    # not if it is outside.  Unlike download_i18n.py's automatic
    # number translation, which fully parses the text, we have no
    # alpha characters here, so we can assume we are inside $...$ if
    # an odd number of non-escaped $s come before the match.
    text_before_match = match.string[:match.start()]
    dollar_signs = _NON_ESCAPED_DOLLAR_SIGN_RE.findall(text_before_match)
    if len(dollar_signs) % 2 == 1:
        return translated_number.replace(",", "{,}")
    return translated_number
def translate_serialized_assessment_item(item):
    """Translate a serialized assessment item into the request's language.

    When the request locale is "en" the item is returned untouched.
    Otherwise the natural language parts of item["item_data"] (a JSON
    string) are translated, and item is updated in place: "item_data"
    gets the translated JSON and "percent_translated" the percentage
    of the translatable parts that have a translation.

    Arguments:
        item: a dict with (at least) an "item_data" JSON string.

    Returns:
        The same item dict.
    """
    mo_locale = intl.request.locale_for_mo()
    if mo_locale == "en":
        # If we are in English don't bother translating as this was the
        # language the content was written in.
        return item

    # keep track of which parts have been translated
    translatable_parts = []

    # crowdin's jipt (just in place translation) locale used on
    # translate.khanacademy.org
    jipt_locale = intl.request.jipt_locale_for_mo()

    def _maybe_translate(part, attr_or_index):
        """Translate in place all natural language text.

        This replaces in memory
           the value of attr in the dict part or
           the value in index within a list part
        with its translation.
        """
        text = part[attr_or_index]
        if not text:
            return

        if intl.regexps.NO_NEED_TO_TRANSLATE.match(text):
            # No alpha characters, but lets see if it has any numbers in it
            # that we could automatically translate.
            part[attr_or_index] = _PERSEUS_NUMBER_RE.sub(_translate_number,
                                                         text)
            return

        translated = i18n._(text)  # @Nolint - it is ok to be
                                   # translating a variable here.
        if jipt_locale and "crwdns" not in translated:
            # We are on translate.ka.org but the string doesn't seem to
            # have the jipt tags, so we add a warning message to the user,
            # and we don't count this as a translatable part.
            translated = ("$\\large{\\red{\\text{The following "
                          "content is not yet on crowdin, check "
                          "back in a week}}}$\n\n%s" %
                          translated)
        else:
            translatable_parts.append(text)
        part[attr_or_index] = translated

    item_data = json.loads(item["item_data"])
    assessment_items.models.AssessmentItem.traverse_natural_language_parts(
        item_data, _maybe_translate)

    # If we are on translate.khanacademy.org we want to calculate the
    # percentage based upon crowdins jipt (Just in place translation) locale
    check_translated_locale = jipt_locale or mo_locale

    if not translatable_parts:
        # If there is nothing to translate in this part we assume it is 100%
        # translated
        item["percent_translated"] = 100.0
    else:
        parts_translated = sum(
            i18n.has_translation(check_translated_locale, text)
            for text in translatable_parts)
        item["percent_translated"] = (
            parts_translated / float(len(translatable_parts))) * 100

    item["item_data"] = json.dumps(item_data)
    return item
def is_fully_translated(assessment_item):
    """Determine if an assessment item is fully translated in request locale.

    The item is serialized and translated, and counts as fully
    translated when its "percent_translated" is 100 (or is missing).
    """
    translated_item = translate_serialized_assessment_item(
        api.jsonify.as_serializable(assessment_item))
    percent_translated = translated_item.get("percent_translated", 100.0)
    return percent_translated == 100.0
"""Holds AssessmentItem and related entities.
AssessmentItem: database entity about a single assessment item that can be
included in exercises, and other things in the future
AssessmentItemVersion: database entity about a single version of an assessment
item for which it is a child
"""
import os
from google.appengine.ext import db
from google.appengine.ext import ndb
import api.jsonify
import backup_model
import compat_key
import content
import datetime
import db_util
import layer_cache
import setting_model
@db_util.disable_ndb_memcache
class AssessmentItemTag(backup_model.BackupModelNDB):
    """A tag that can be attached to assessment items.

    All tags are created under the same parent key,
    ndb.Key(AssessmentItemTag, '0'), and are queried through it.
    """

    _serialize_whitelist = ['id', 'description', 'display_name']

    description = ndb.StringProperty()
    display_name = ndb.StringProperty()
    deleted = ndb.BooleanProperty(indexed=True, default=False)

    @property
    def id(self):
        """The urlsafe form of this tag's key."""
        return self.key.urlsafe()

    def validate(self):
        """Check the tag before it is returned by create(); a no-op."""
        # TODO(cbhl): Prevent duplicate tags from being created.

    @staticmethod
    def create(**kwargs):
        """Create and return a new tag, without putting it to the datastore."""
        # Keep all the tags in the same entity group to get strong consistency
        shared_parent = ndb.Key(AssessmentItemTag, '0')
        tag = AssessmentItemTag(parent=shared_parent, **kwargs)
        tag.validate()
        return tag

    @staticmethod
    def query_all():
        """Return a query for every tag that is not marked deleted."""
        shared_parent = ndb.Key(AssessmentItemTag, '0')
        # '== False' builds an ndb filter here; it is not a python
        # truth test and must not be rewritten as one.
        return AssessmentItemTag.query(
            AssessmentItemTag.deleted == False,
            ancestor=shared_parent)

    @staticmethod
    @layer_cache.cache_with_key_params_fxn(
        lambda: setting_model.Setting.cached_assessment_item_tags_date,
        cache_separately_per_display_language=False)
    def fetch_all():
        """Fetch the results of query_all()."""
        # NOTE(review): presumably cached by layer_cache, keyed on the
        # cached_assessment_item_tags_date setting -- confirm.
        return AssessmentItemTag.query_all().fetch()

    @staticmethod
    def find_by_display_name(display_name):
        """Return the first tag found with this display name, via get()."""
        # TODO(alpert): Filter by deleted here?
        shared_parent = ndb.Key(AssessmentItemTag, '0')
        return AssessmentItemTag.query(
            AssessmentItemTag.display_name == display_name,
            ancestor=shared_parent).get()
class BaseAssessmentItem(object):
content_kind = "AssessmentItem"
content_kind_code = "i"
item_data = ndb.StringProperty(indexed=False)
author_names = ndb.StringProperty(indexed=False, repeated=True)
created_by = ndb.KeyProperty(indexed=True)
tags = ndb.KeyProperty(indexed=True, repeated=True, kind=AssessmentItemTag)
# Name used to refer to this item in the perseus interface
name = ndb.StringProperty(indexed=False)
@staticmethod
def random_id():
"""Generate a random unique identifier for a future new entity.
We override the implementation from BaseRevision because we expect to
have many thousands of items and so the probability of a collision with
32-bit keys is significant."""
return 'x' + os.urandom(8).encode('hex')
@staticmethod
def traverse_item_data(item_data, handle_content, handle_widget):
# We parse item_data using the schema at:
# https://gist.github.com/alopatin/96d923d57bcfb7e8bf4e
# If you change this code make sure to change the corresponding
# translating code in api/v1_assessment_items.py
handle_content(item_data.get("question", {}))
for hint in item_data.get("hints", []):
handle_content(hint)
answer = item_data.get("answerArea", {})
if answer.get("type") == "multiple":
handle_content(answer.get("options", {}))
else:
handle_widget(answer)
@staticmethod
def traverse_natural_language_parts(item_data, handler):
    """Visit every natural-language string stored in item_data.

    For each string found, handler(container, key_or_index) is called so
    that container[key_or_index] is the string.  The container is either
    a dict (second argument is the key) or a list (second argument is the
    index), which lets the handler replace the string in memory.

    We parse the item_data using the schema at:
       https://gist.github.com/alopatin/96d923d57bcfb7e8bf4e
    If we change that schema, by say adding a new widget, we will need to
    update this function.
    """
    def visit_strings(options, attr):
        # options[attr] is a list whose elements are the text itself.
        strings = options.get(attr, [])
        for position in range(len(strings)):
            handler(strings, position)

    def visit_content_dicts(options, attr):
        # options[attr] is a list of dicts that keep text under "content".
        for entry in options.get(attr, []):
            if entry.get("content"):
                handler(entry, "content")

    # Each row is (widget types, fields); fields are visited in the order
    # listed, and each field is (visitor, name of the option to visit).
    translatable_fields = (
        (("radio", "dropdown"),
         ((visit_content_dicts, "choices"),)),
        (("categorization",),
         ((visit_content_dicts, "items"),
          (visit_strings, "categoryHeaders"))),
        (("plotter",),
         ((visit_strings, "labels"),
          (visit_strings, "categories"))),
        (("orderer",),
         ((visit_content_dicts, "options"),
          (visit_content_dicts, "correctOptions"),
          (visit_content_dicts, "otherOptions"))),
        (("matcher",),
         ((visit_strings, "labels"),
          (visit_strings, "left"),
          (visit_strings, "right"))),
        (("sorter",),
         ((visit_strings, "correct"),)),
        (("image",),
         ((visit_content_dicts, "labels"),)),
    )

    def visit_widget(widget):
        options = widget.get("options", {})
        widget_type = widget.get("type")
        for (widget_types, fields) in translatable_fields:
            if widget_type in widget_types:
                for (visit, option_name) in fields:
                    visit(options, option_name)
                return
        # Any other widget type has nothing for us to visit.

    def visit_renderer(renderer):
        if renderer.get("content"):
            handler(renderer, "content")
        for widget in renderer.get("widgets", {}).values():
            visit_widget(widget)

    AssessmentItem.traverse_item_data(
        item_data, visit_renderer, visit_widget)
def preview_relative_url(self):
    """Return the relative preview url for this item, built from its key id."""
    item_id = self.key.id()
    return "/preview/content/items/%s" % item_id
def preview_relative_url_in_exercise(self, exercise):
    """Return a url that previews all items in exercise, jumping to this one.

    The exercise's name goes in the "exercises" query parameter and this
    item's key id becomes the url fragment.

    This presumes that the item is in the exercise passed in.  If it is
    not, then it will just go to the first assessment item in that
    exercise.
    """
    url_parts = (exercise.name, self.key.id())
    return "/preview/content/items?exercises=%s#%s" % url_parts
@db_util.disable_ndb_memcache
class AssessmentItem(BaseAssessmentItem,
                     content.models.VersionedContentNDB):
    """Information about a single assessment item."""

    # Metaclass is needed to allow inheriting DB properties from
    # BaseAssessmentItem
    __metaclass__ = db_util.NDBInheritModelPropertiesType

    _serialize_whitelist = [
        'id', 'creation_date', 'item_data', 'author_names',
        'tags', 'sha', 'name']

    # Key of the AssessmentItemRevision currently being edited, if any.
    edit_revision = ndb.KeyProperty(kind='AssessmentItemRevision',
                                    indexed=False)

    # Truthy when an edit revision is set and its key id differs from
    # self.sha.  Note that when edit_revision is falsy the computed value
    # is edit_revision itself (e.g. None), not False.
    is_dirty = ndb.ComputedProperty(
        lambda self: (self.edit_revision and
                      self.edit_revision.id() != self.sha))

    def get_edit_revision(self):
        """Blocking version of get_edit_revision_async()."""
        return self.get_edit_revision_async().get_result()

    def get_edit_revision_async(self):
        """Return a future for the edit revision.

        Falls back to the current revision when edit_revision is not set.
        """
        if self.edit_revision:
            return self.edit_revision.get_async()
        else:
            return self.get_current_revision_async()

    def get_revisions(self):
        """Fetch every AssessmentItemRevision whose content_id is self.id."""
        return (AssessmentItemRevision.query()
                .filter(AssessmentItemRevision.content_id == self.id)
                .fetch())

    @staticmethod
    def create_sorted_revision_list(revisions):
        """Summarize revisions, newest first.

        Args:
            revisions: iterable of revision objects exposing
                creation_date, created_by and sha.  Revisions without a
                creation_date sort last.

        Returns:
            A list with one dict per revision, holding:
              "created_by": the creator's nickname, or "" when the
                  revision has no creator or the creator cannot be
                  loaded;
              "sha": the revision's sha;
              "creation_date": the creation date in ISO format, or "".
        """
        def date_sort(revision):
            return revision.creation_date or datetime.datetime.min

        sorted_revisions = sorted(revisions, key=date_sort, reverse=True)

        # Fetch revision creators with a single batch get.  We walk
        # sorted_revisions, not revisions: sorted() has already consumed
        # the input, so a one-shot iterator would yield nothing here.
        created_by_keys = set()
        for revision in sorted_revisions:
            if revision.created_by:
                created_by_keys.add(compat_key.from_(revision.created_by).db)

        # NOTE(review): assumes db.get() behaves like
        # google.appengine.ext.db.get(), which returns None in place of
        # any entity that no longer exists -- confirm.  Such entries are
        # skipped rather than crashing on None.key().
        created_by = {user.key(): user
                      for user in db.get(list(created_by_keys))
                      if user is not None}

        def get_created_by(revision):
            if not revision.created_by:
                return ""
            creator = created_by.get(
                compat_key.from_(revision.created_by).db)
            if creator is None:
                # The creator could not be loaded; show no name.
                return ""
            return creator.nickname

        def get_creation_date(revision):
            if revision.creation_date:
                return revision.creation_date.date().isoformat()
            else:
                return ""

        return [{
            "created_by": get_created_by(revision),
            "sha": revision.sha,
            "creation_date": get_creation_date(revision),
        } for revision in sorted_revisions]
@db_util.disable_ndb_memcache
class AssessmentItemRevision(BaseAssessmentItem,
                             content.models.BaseRevisionNDB):
    """Information about a single version of an assessment item."""

    # Metaclass is needed to allow inheriting DB properties from
    # BaseAssessmentItem
    __metaclass__ = db_util.NDBInheritModelPropertiesType

    @classmethod
    def fixup_after_deserialize(cls, properties):
        """Normalize deserialized properties in place.

        After running the base-class fixup this:
          - serializes "item_data" with api.jsonify.jsonify() when it was
            passed as a dict (any dict subclass counts);
          - converts each entry of "tags" to an ndb key, or replaces a
            None "tags" value with an empty list.

        Args:
            properties: mutable mapping of property name to value.
        """
        super(AssessmentItemRevision, cls).fixup_after_deserialize(properties)

        # Allow item_data to be passed up as a JSON object or string
        # TODO(joel) - potentially remove this in the future
        # isinstance() rather than an exact type comparison, so that dict
        # subclasses such as OrderedDict are serialized too.
        if ("item_data" in properties and
                isinstance(properties["item_data"], dict)):
            properties["item_data"] = api.jsonify.jsonify(
                properties["item_data"])

        # Convert string keys to actual keys
        if "tags" in properties:
            raw_tags = properties["tags"]
            if raw_tags is None:
                properties["tags"] = []
            else:
                properties["tags"] = [compat_key.from_(key).ndb
                                      for key in raw_tags]

    # Because AssessmentItems are never published, getting
    # AssessmentItemRevisions should be made convenient.
    @staticmethod
    def get_by_sha(sha):
        """Look up one revision by sha via BaseRevisionNDB.get_by_sha()."""
        # The name passed is "AssessmentItem", not this class's own name.
        return content.models.BaseRevisionNDB.get_by_sha("AssessmentItem", sha)

    @staticmethod
    def get_by_sha_list(sha_list):
        """Look up several revisions by sha.

        Builds one "AssessmentItemRevision" key per sha and returns
        whatever compat_key.maybe_get_multi() gives back for those keys.
        """
        key_list = [ndb.Key("AssessmentItemRevision", sha) for sha in sha_list]
        return compat_key.maybe_get_multi(key_list)
"""A Compile object (see compile_rule.py): translates khan-exercises files."""
import re
from kake import translate_javascript
from kake import translate_util
from kake.lib import compile_rule
class TranslateExercises(translate_util.TranslateBase):
    """Class for translating natural-language text in khan-exercises files."""

    # Matches a data-if attribute with a single-quoted, double-quoted or
    # unquoted value.
    _DATA_IF_RE = re.compile(r"data-if\s*=\s*('[^']*'|\"[^\"]*\"|[^'\">\s]+)")

    def version(self):
        """Update every time build() changes in a way that affects output."""
        return 1

    def _ngettext_html(self, text, tag_info, lang, translation):
        """Creates 'translated' html for an ngettext-like span.

        In exercises html, ngettext is implemented like so:
           <span data-if="isSingular(KEY)" foo=bar>xxx</span>
           <span data-else foo=bar>yyy</span>

        When translating into Polish, with 4 plural forms, we want to emit
           <span data-if="$.ngetpos(KEY, LANG) === 0" foo=bar>aaa</span>
           <span data-else-if="$.ngetpos(KEY, LANG) === 1" foo=bar>bbb</span>
           <span data-else-if="$.ngetpos(KEY, LANG) === 2" foo=bar>ccc</span>
           <span data-else-if="$.ngetpos(KEY, LANG) === 3" foo=bar>ddd</span>

        Args:
            text: the full contents of the file being translated.
            tag_info: tag-info for the English singular span; its
                is_singular() value is used as KEY.
            lang: the language code to embed as LANG.
            translation: the plural translation, indexable by the
                plural-form numbers 0 .. len(translation) - 1.

        Returns:
            A string consisting of the desired output: one span per
            plural form.  It replaces the singular span only; the caller
            removes the English plural span.
        """
        old_starttag = text[tag_info.outer_startpos:tag_info.startpos]
        old_endtag = text[tag_info.endpos:tag_info.outer_endpos]
        ngetpos_key = tag_info.is_singular()
        data_if = self._DATA_IF_RE.search(old_starttag)

        assert ngetpos_key is not None, tag_info
        assert data_if, old_starttag
        assert translate_util.SmallMOFile.is_plural(translation), translation

        # Everything in the start tag around the data-if attribute is the
        # same for every emitted span, so slice it out just once.
        tag_prefix = old_starttag[:data_if.start()]
        tag_suffix = old_starttag[data_if.end():]

        new_text = []
        for i in range(len(translation)):
            if i == 0:
                new_data_if = ('data-if="$.ngetpos(%s, \'%s\') === 0"'
                               % (ngetpos_key, lang))
            else:
                new_data_if = ('data-else-if="$.ngetpos(%s, \'%s\') === %d"'
                               % (ngetpos_key, lang, i))
            new_text.extend((tag_prefix,
                             new_data_if,
                             tag_suffix,
                             translation[i],
                             old_endtag))
        return ''.join(new_text)

    @staticmethod
    def _attrval_span(file_contents, tag_info, attrval_string):
        """Return (startpos, endpos) of attrval_string in tag_info's start tag.

        Raises ValueError if attrval_string does not occur there.
        """
        # We need to figure out where attrval_string is inside
        # tag_info.  TODO(csilvers): have attrval_pos passed in.
        attrval_pos = file_contents.index(attrval_string,
                                          tag_info.outer_startpos,
                                          tag_info.startpos)
        return (attrval_pos, attrval_pos + len(attrval_string))

    def _javascript_rewrites(self, file_contents, js_translator,
                             script_jobs, script_in_attr_jobs,
                             lang, mo_entries):
        """Return the rewrites that translate the javascript into lang.

        script_jobs and script_in_attr_jobs hold (tag_info, javascript,
        babel_output) triples prepared once by translate().
        """
        rewrites = []

        # Handle javascript in, e.g., "<script>js</script>"
        for (tag_info, script_contents, babel_output) in script_jobs:
            translated_js = js_translator.translate_to_lang(
                babel_output, script_contents, lang, mo_entries)
            # Only inject the string if there is a translation
            if translated_js is not None:
                rewrites.append((tag_info.startpos, tag_info.endpos,
                                 translated_js))

        # Handle javascript in, e.g., "<span data-if="js">...</span>"
        for (tag_info, attrval_string, babel_output) in script_in_attr_jobs:
            # tag_info is the tag-info for the span, attrval_string is js.
            translated_js = js_translator.translate_to_lang(
                babel_output, attrval_string, lang, mo_entries)
            if translated_js is not None:
                (startpos, endpos) = self._attrval_span(
                    file_contents, tag_info, attrval_string)
                rewrites.append((startpos, endpos, translated_js))

        return rewrites

    def _nltext_rewrites(self, file_contents, extractor, nltext_info,
                         lang, mo_entries):
        """Return the rewrites that translate text found between tags."""
        rewrites = []
        last_was_singular = False    # used when we're injecting ngettext
        for tag_info in nltext_info:
            if last_was_singular:
                # For ngettext, we want to replace the English
                # singular + plural spans with a bunch of
                # translated spans (one for each plural).  We
                # already deleted the English singular and
                # inserted the new stuff, so all that's left is to
                # delete the English plural.
                rewrites.append((tag_info.outer_startpos,
                                 tag_info.outer_endpos,
                                 ''))
                last_was_singular = False
                continue

            (cleaned_contents, _) = extractor.cleaned_text(tag_info)

            if tag_info.is_singular():
                # An ngettext call.  We replace the English singular
                # with a bunch of translated spans (one per plural).
                translation = mo_entries.get_plural_translation(
                    cleaned_contents)
                if translation:   # a translation was found
                    new_text = self._ngettext_html(file_contents, tag_info,
                                                   lang, translation)
                    rewrites.append((tag_info.outer_startpos,
                                     tag_info.outer_endpos,
                                     new_text))
                    last_was_singular = True
            else:
                # A gettext call.  We can replace the text between
                # the tags with translated text.
                translation = mo_entries.get_singular_translation(
                    cleaned_contents)
                if translation:
                    rewrites.append((tag_info.startpos, tag_info.endpos,
                                     translation))
        return rewrites

    def _nltext_attr_rewrites(self, file_contents, nltext_in_attr_info,
                              mo_entries):
        """Return the rewrites for text in, e.g., "<input value="text">"."""
        rewrites = []
        for (tag_info, attrval_string) in nltext_in_attr_info:
            # tag_info is the tag-info for the input, attrval is text.
            translation = mo_entries.get_singular_translation(
                attrval_string)
            if translation:
                (startpos, endpos) = self._attrval_span(
                    file_contents, tag_info, attrval_string)
                rewrites.append((startpos, endpos, translation))
        return rewrites

    @staticmethod
    def _apply_rewrites(file_contents, rewrites):
        """Return file_contents with rewrites applied, or None if unchanged.

        Each rewrite is (startpos, endpos, new_text), meaning
        file_contents[startpos:endpos] is to be replaced by new_text.
        """
        # Small optimization to catch a case we know output is unchanged.
        if not rewrites:
            return None

        # Construct the new file from the original file + rewrites.
        pieces = []
        pos = 0
        for (startpos, endpos, new_text) in sorted(rewrites):
            pieces.append(file_contents[pos:startpos])
            pieces.append(new_text)
            pos = endpos
        pieces.append(file_contents[pos:])
        translated_contents = ''.join(pieces)

        if translated_contents == file_contents:
            return None     # signal that output == input
        return translated_contents

    def translate(self, infile_name, outfile_lang_moentries_context):
        """Write one translated copy of an exercise file per language.

        Args:
            infile_name: the exercise html file to translate.
            outfile_lang_moentries_context: iterable of 4-tuples
                (outfile, lang, mo_entries, context); the last element
                is not used here.  For each tuple the translated contents
                are handed to self._write_output(), with None standing
                for "output is the same as the input".
        """
        # Import here so kake users who don't actually need the
        # translate_exercises rule can still import this module
        # without needing to bring in exercises.
        import exercises.babel

        file_contents = self._read_input(infile_name)

        # Get all the info about the scripts and nltext in this document.
        extractor = exercises.babel.I18nExtractor()
        extractor.feed(file_contents)
        nltext_info = list(extractor.nltext_nodes())
        nltext_in_attr_info = list(extractor.nltext_attribute_values())
        script_info = list(extractor.javascript_nodes())
        script_in_attr_info = list(extractor.javascript_attribute_values())

        js_translator = translate_javascript.TranslateJavascript()

        # Finding the gettext calls inside a piece of javascript does not
        # depend on the target language, so we do it once per script here
        # rather than once per script per language.
        script_jobs = []
        for tag_info in script_info:
            script_contents = file_contents[tag_info.startpos:
                                            tag_info.endpos]
            script_jobs.append(
                (tag_info, script_contents,
                 js_translator.extract_nltext(script_contents)))
        script_in_attr_jobs = [
            (tag_info, attrval_string,
             js_translator.extract_nltext(attrval_string))
            for (tag_info, attrval_string) in script_in_attr_info]

        for (outfile, lang, mo_entries, _) in outfile_lang_moentries_context:
            # Keep track of the rewrites we're supposed to do.
            rewrites = []   # each entry is (startpos, endpos, new_text)
            rewrites.extend(self._javascript_rewrites(
                file_contents, js_translator, script_jobs,
                script_in_attr_jobs, lang, mo_entries))
            rewrites.extend(self._nltext_rewrites(
                file_contents, extractor, nltext_info, lang, mo_entries))
            rewrites.extend(self._nltext_attr_rewrites(
                file_contents, nltext_in_attr_info, mo_entries))

            translated_contents = self._apply_rewrites(file_contents,
                                                       rewrites)
            self._write_output(infile_name, outfile, translated_contents)
# Register the rule that produces each translated khan-exercises html
# file, using TranslateExercises.  The inputs listed are the English
# exercise file and that language's small_mo.pickle for the file.
compile_rule.register_compile(
'TRANSLATED KA-EXERCISE',
'genfiles/translations/{lang}/khan-exercises/{{path}}.html',
['khan-exercises/{{path}}.html',
('genfiles/extracted_strings/{lang}/'
'khan-exercises/{{path}}.html.small_mo.pickle')],
TranslateExercises(),
# small_mo.pickle files are recreated every time {lang}.po files
# change, but their contents usually don't change, so crc's are
# good for us.
compute_crc=True)
"""A Compile object (see compile_rule.py): translates .js files."""
import cStringIO
import json
import re
import third_party.babel.messages.extract
import intl.data
import intl.english_only
from kake import translate_util
from kake.lib import compile_rule
class TranslateJavascript(translate_util.TranslateBase):
    """Class for translating natural-language text in javascript files."""

    # Cheap pre-check: matches any of $._, $.ngettext and $_.
    _JS_GETTEXT_RE = re.compile(r'\$\._|\$\.ngettext|\$_')

    _BABEL_KW = third_party.babel.messages.extract.DEFAULT_KEYWORDS.copy()
    # <$_> in jsx expands to $_({varmap}, "string", ...), so kw-index is 2.
    _BABEL_KW['$_'] = (2,)     # used in .jsx files as <$_>

    def version(self):
        """Update every time build() changes in a way that affects output."""
        return 1

    def extract_nltext(self, file_contents):
        """Return the list of messages babel extracts from file_contents.

        translate_to_lang() unpacks each entry of the returned list as
        (messages, start, end).
        """
        # As a small efficiency short-cut, we can say that if none of
        # ($._, $.ngettext, $_) are present in file_contents, there
        # won't be any translations to do.
        if self._JS_GETTEXT_RE.search(file_contents) is None:
            return []

        # Convert it to a StringIO buffer for pybabel to handle.
        # pybabel expects utf-8 encoded input, so that's what we give it.
        utf8_buffer = cStringIO.StringIO(file_contents.encode('utf-8'))
        extracted = third_party.babel.messages.extract.extract_javascript(
            utf8_buffer, self._BABEL_KW, [], {'messages_only': True})
        return list(extracted)    # convert from iterator if need be

    def _insert_text_for(self, messages, lang, mo_entries):
        """Return the javascript text replacing one message, or None.

        None means mo_entries has no translation for the message, so the
        original text should be left alone.
        """
        # Figure out the lookup key.  extract_javascript returns a
        # list if a plural is found.
        if isinstance(messages, basestring):       # singular
            is_plural = False
            key = messages
        elif messages[0] is None:                  # jsx-style $_()
            # For the jsx $_() operator, the gettext string is the
            # *second* argument (the first argument is the value-dict).
            is_plural = False
            key = messages[1]
        else:                                      # plural
            is_plural = True
            key = messages[0]

        if not is_plural:
            translation = mo_entries.get_singular_translation(key)
            if not translation:
                return None
            return json.dumps(translation)

        translation_dict = mo_entries.get_plural_translation(key)
        if not translation_dict:
            return None
        # We need the messages to be sorted by index.
        plural_forms = [message for (_, message)
                        in sorted(translation_dict.iteritems())]
        # We store the info we need in legal javascript format.
        return json.dumps({"lang": lang, "messages": plural_forms})

    def translate_to_lang(self, babel_output, file_contents, lang, mo_entries):
        """Return file_contents translated into lang, or None if unchanged.

        Args:
            babel_output: the list returned by extract_nltext() for
                file_contents.
            file_contents: the javascript text to translate.
            lang: language code stored alongside plural translations.
            mo_entries: object providing get_singular_translation() and
                get_plural_translation().
        """
        # Keep track of if the translated file differs from the original.
        has_diff = False

        # Go through all of the matched messages in reverse (to avoid
        # having to deal with the changes in position of the messages).
        for (messages, start, end) in reversed(babel_output):
            insert_text = self._insert_text_for(messages, lang, mo_entries)
            if insert_text is None:
                continue

            # Insert the string at the right position.
            file_contents = (file_contents[:start] +
                             insert_text +
                             file_contents[end:])
            # A change was made to the file.
            has_diff = True

        if not has_diff:
            return None          # signals that output == input
        return file_contents

    def translate(self, infile_name, outfile_lang_moentries_context):
        """Write one translated copy of infile_name per requested language.

        outfile_lang_moentries_context is an iterable of 4-tuples
        (outfile, lang, mo_entries, context); the last element is unused.
        """
        if intl.english_only.should_not_translate(infile_name):
            # If we shouldn't translate it, we can just symlink it!
            for (outfile, _, _, _) in outfile_lang_moentries_context:
                self._write_output(infile_name, outfile, None)
            return

        file_contents = self._read_input(infile_name)
        babel_output = self.extract_nltext(file_contents)

        for (outfile, lang, mo_entries, _) in outfile_lang_moentries_context:
            # Get the translation, or None if output == input.
            translated_contents = self.translate_to_lang(
                babel_output, file_contents, lang, mo_entries)
            self._write_output(infile_name, outfile, translated_contents)
# These rules are only used in dev (where we don't compress js), and
# for js worker files, which we likewise translate without compressing.
compile_rule.register_compile(
'TRANSLATED RAW JS FILES',
'genfiles/translations/{lang}/{{path}}.js',
['{{path}}.js',
'genfiles/extracted_strings/{lang}/{{path}}.js.small_mo.pickle'],
TranslateJavascript(),
# small_mo.pickle files are recreated every time {lang}.po files
# change, but their contents usually don't change, so crc's are
# good for us.
compute_crc=True)
# This catches files that are compiled (or transpiled) into js.
dirs_with_js = ('genfiles/compiled_{type}',
# This is a special case (bundle.js has its own directory)
'genfiles/khan-exercises',
)
# Register one rule per directory above.  The directory is included in
# the rule's label, so each registration gets a distinct label, and the
# English ('en') file in that directory is the input being translated.
for d in dirs_with_js:
translate_util.register_translatesafe_compile(
'TRANSLATED JS FILES (%s)' % d,
'%s/{lang}/{{path}}.js' % d,
['%s/en/{{path}}.js' % d,
('genfiles/extracted_strings/{lang}/'
'%s/{lang}/{{path}}.js.small_mo.pickle' % d)],
TranslateJavascript(),
compute_crc=True)
# This is the rule used in prod, where we only translate javascript
# after it's been compressed. The exception is handlebars files,
# which are translated before they're even converted to javascript, in
# compile_handlebars.py. Luckily, the special-case rule for the
# handlebars files (in compress_js.py) has higher precedence than this
# rule, so we can be fully general here.
# We use 'trumped_by' to make sure this rule doesn't apply when lang=en,
# and also to make sure this rule doesn't apply when translating handlebars.
translate_util.register_translatesafe_compile(
'TRANSLATED COMPRESSED JS FILES',
'genfiles/compressed_javascript/{lang}/{{path}}.min.js',
['genfiles/compressed_javascript/en/{{path}}.min.js',
('genfiles/extracted_strings/{lang}/genfiles/compressed_javascript/{lang}'
'/{{path}}.min.js.small_mo.pickle')],
TranslateJavascript(),
trumped_by=['COMPRESSED JS', 'COMPRESSED TRANSLATED HANDLEBARS JS'],
compute_crc=True,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment