perseus i18n files from KA
#!/usr/bin/env python | |
"""Extracts translatable strings from HTML exercise files. | |
This program is used for extracting translatable strings from | |
exercise files and optionally outputting a PO or JSON file to be used | |
for further translation. | |
How it works: The script goes through the HTML files and attempts | |
to locate all nodes that have text as a direct child node. It then | |
extracts the HTML contents of that node and returns it as a translatable | |
string. Note that certain nodes are excluded from this list as they | |
contain non-translatable text (see _IGNORE_NODES). It is assumed that | |
everything that isn't part of _IGNORE_NODES is something that needs to be | |
translated. | |
We also call out to babel to extract javascript inside exercises (in | |
<script> tags, but also in some other places where javascript is | |
allowed in exercise html files). | |
""" | |
import cStringIO | |
import HTMLParser | |
import argparse | |
import json | |
import os.path | |
import re | |
import sys | |
# Make sure we can import third_party even when run from the commandline. | |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
import third_party.babel.messages.extract | |
from third_party import i18nize_templates | |
from third_party import polib | |
# All the tags that we want to treat as javascript. We'll call the | |
# javascript extractor on text in these tags, to extract out the | |
# natural language text from javascript. | |
_JAVASCRIPT_TAGS = frozenset(( | |
'script', | |
'var', | |
)) | |
# We also extract javascript from the innerhtml of tags that contain a | |
# given attribute/value pair. We consider a tag to hold javascript if | |
# it has the given attribute name and *CONTAINS* the given value text | |
# inside its attribute value, so '<span class="foo-guess-bar">' matches. | |
_JAVASCRIPT_ATTRVALS = frozenset(( | |
('class', 'graphie'), | |
('class', 'guess'), | |
('class', 'validator-function'), | |
)) | |
# TODO(csilvers): Convert to a more efficient data structure. | |
_JAVASCRIPT_ATTRVAL_ATTRS = frozenset((attr for (attr, _) in | |
_JAVASCRIPT_ATTRVALS)) | |
# Unlike _JAVASCRIPT_ATTRVALS, which says a tag contains javascript if | |
# it has a given attrval, this says that the attribute value *itself* | |
# is javascript. For instance <span data-if="<some javascript>">. | |
_JAVASCRIPT_ATTRS = frozenset(( | |
'data-choices', | |
'data-else', | |
'data-ensure', | |
'data-if', | |
'data-if-else', | |
)) | |
# All the tags that we want to ignore and not extract strings from. | |
_IGNORE_TAGS = frozenset(( | |
'code', | |
'style', | |
)) | |
# Ignore all tags with this attribute name/value pair. We ignore a | |
# tag if it has the given attribute name and *CONTAINS* the given | |
# value text inside its attribute value, so | |
# '<span data-type="a-regexp">' matches. | |
_IGNORE_ATTRVALS = frozenset(( | |
('data-type', 'regex'), | |
)) | |
# TODO(csilvers): Convert to a more efficient data structure. | |
_IGNORE_ATTRVAL_ATTRS = frozenset((attr for (attr, _) in _IGNORE_ATTRVALS)) | |
# The base URL for referencing an exercise | |
_EXERCISE_URL = 'http://www.khanacademy.org/exercise/%s' | |
class UnmatchedEndTagError(Exception): | |
def __init__(self, tagname, linenum, colnum): | |
super(UnmatchedEndTagError, self).__init__() | |
self.linenum = linenum | |
self.colnum = colnum | |
self.tagname = tagname | |
def __str__(self): | |
# Not super-useful since we don't know the filename! | |
return ('No start tag found for end-tag </%s> at line %s, col %s' | |
% (self.tagname, self.linenum, self.colnum)) | |
def main(): | |
"""Handle running this program from the command-line.""" | |
# Handle parsing the program arguments | |
arg_parser = argparse.ArgumentParser( | |
description='Extract translatable strings from HTML exercise files.') | |
arg_parser.add_argument('html_files', nargs='+', | |
help='The HTML exercise files to extract strings from.') | |
arg_parser.add_argument('--output', dest='output', | |
help='The file to write the output to.') | |
arg_parser.add_argument('--format', choices=['po', 'json'], | |
dest='format', default='po', | |
help='The format of the output. (default: %(default)s)') | |
arg_parser.add_argument('--quiet', action='store_true', | |
help='Do not emit status to stderr on successful runs.') | |
args = arg_parser.parse_args() | |
if args.format == 'po': | |
# Output a PO file by default | |
results = unicode(make_potfile(args.html_files, | |
not args.quiet)).encode('utf-8') | |
else: | |
# Optionally output a JSON-encoded data structure | |
results = json.dumps(extract_files(args.html_files, not args.quiet), | |
cls=_SetEncoder, indent=2) | |
if args.output: | |
# If an output location is specified, write the output to that file | |
output_file = open(args.output, 'w') | |
output_file.write(results) | |
output_file.close() | |
else: | |
# Otherwise just write the output to STDOUT | |
print results | |
def make_potfile(files, verbose): | |
"""Generate a PO file from a collection of HTML files. | |
Returns the string representing the PO file. | |
""" | |
# Turn off line-wrapping: it can mess with html markup inside PO comments. | |
output_pot = polib.POFile(wrapwidth=sys.maxint, encoding='utf-8') | |
matches = extract_files(files, verbose) | |
for (nl_text, comments, occurrences) in matches: | |
# Build the PO entry and add it to the PO file | |
# If nl_text is a tuple, it means we have a plural (ngettext) entry | |
if isinstance(nl_text, basestring): | |
(msgid, msgid_plural) = (unicode(nl_text), None) | |
(msgstr, msgstr_plural) = ("", None) | |
else: | |
(msgid, msgid_plural) = (unicode(nl_text[0]), unicode(nl_text[1])) | |
(msgstr, msgstr_plural) = (None, {0: u"", 1: u""}) | |
output_pot.append(polib.POEntry( | |
msgid=msgid, | |
msgid_plural=msgid_plural, | |
msgstr=msgstr, | |
msgstr_plural=msgstr_plural, | |
comment='\n'.join(comments), | |
occurrences=occurrences, | |
)) | |
return output_pot | |
def extract_files(files, verbose): | |
"""Extract a collection of translatable strings from a set of HTML files. | |
Returns: | |
A list of natural language texts and their occurrences: | |
[(nl-text, (comment, ...), | |
((1st-file, 1st-linenum), (2nd-file, 2nd-linenum), ...)), | |
... | |
] | |
For each nl-text, the (file, linenum) pairs are sorted in | |
lexicographic order (first by filename, then by line-number). | |
The list of natural-language texts is sorted by the (1st-file, | |
1st-linenum), to maximize the chances texts from the same file | |
will sort together. | |
""" | |
matches = {} | |
comments = {} | |
# Go through all the exercise files. | |
for filename in files: | |
if verbose: | |
print >>sys.stderr, 'Extracting strings from: %s' % filename | |
extract_file(filename, matches, comments) | |
if verbose: | |
num_matches = len(matches) | |
print >>sys.stderr, ('%s string%s extracted.' | |
% (num_matches, "" if num_matches == 1 else "s")) | |
# Get the matches into the return format. | |
retval = [] | |
for (nl_text, occurrences) in matches.iteritems(): | |
occurrences = sorted(occurrences) | |
# Get the file name of the exercise, to generate a URL reference | |
first_filename = occurrences[0][0] | |
msg_comments = list(comments.get(nl_text, [])) | |
msg_comments.append( | |
unicode('Text is in <%s>' % _filename_to_url(first_filename))) | |
retval.append((nl_text, msg_comments, occurrences)) | |
# Now sort the nl-texts so they come in order of their first occurrence. | |
# We break ties -- should be rare -- arbitrarily. | |
retval.sort(key=lambda (nl_text, comments, occ): (occ[0], nl_text)) | |
return retval | |
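# An illustrative (made-up) return value from extract_files(), matching the
# docstring above: two files, one plain string and one ngettext pair.
#
#   [(u'What is <var>A</var> + <var>B</var>?',
#     [u'Text is in <http://www.khanacademy.org/exercise/addition_1>'],
#     [('addition_1.html', 12), ('addition_2.html', 40)]),
#    ((u'<var>N</var> apple', u'<var>N</var> apples'),
#     [u'Text is in <http://www.khanacademy.org/exercise/addition_1>'],
#     [('addition_1.html', 30)])]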
class I18nExtractor(HTMLParser.HTMLParser): | |
"""Lexes html, and returns interesting tags via get_node(). | |
'Interesting' tags are those that a) have text directly in them | |
(that is, not inside some nested tag), b) are not in _IGNORE_TAGS | |
or _IGNORE_ATTRVALS, and c) are not nested inside another | |
'interesting' tag. The data returned is the full html of that | |
tag: that is, everything between <tag> and </tag> | |
(non-inclusive). Whitespace in the return value is collapsed. | |
This assumes well-formed html! It will do weird things otherwise. | |
NOTE: This class is used by webapp:kake/translate_exercises.py, | |
so be careful about code there if you refactor here. | |
""" | |
class TagInfo(object): | |
def __init__(self, tagname, attrs, | |
startline, startpos, outer_startpos, parent_tag_info): | |
self.tagname = tagname | |
self.attrs = attrs | |
self.should_emit_tag = True # start optimistic | |
self.tag_has_non_whitespace = False # start pessimistic | |
self.startline = startline # line # into text | |
self.startpos = startpos # offset into text | |
self.endpos = None # will point after text | |
self.outer_startpos = outer_startpos # offset into tag | |
self.outer_endpos = None # will point after close-tag | |
# We know right away we shouldn't emit a tag if any of the | |
# following are true: we shouldn't emit the tag's parent, | |
# the tag is in _IGNORE_TAGS, or the tag has some | |
# attribute values that are in _IGNORE_ATTRVALS. | |
if self.should_emit_tag: | |
self.should_emit_tag = (not parent_tag_info or | |
parent_tag_info.should_emit_tag) | |
if self.should_emit_tag: | |
self.should_emit_tag = not self._tag_matches( | |
_IGNORE_TAGS, _IGNORE_ATTRVAL_ATTRS, _IGNORE_ATTRVALS) | |
_IS_SINGULAR_RE = re.compile('^isSingular\((.*)\)$') | |
def _tag_matches(self, tagnames, attrs, attrvals): | |
"""True if our tag is in tagnames or an attr/val is in attrvals.""" | |
if self.tagname in tagnames: | |
return True | |
for attrval in self.attrs: | |
if attrval[0] in attrs: | |
for (match_attr, match_tag) in attrvals: | |
if (match_attr == attrval[0] and | |
match_tag in attrval[1]): # *CONTAINS* | |
return True | |
return False | |
def is_javascript_tag(self): | |
"""True if this tag contains javascript in its innerhtml.""" | |
return self._tag_matches(_JAVASCRIPT_TAGS, | |
_JAVASCRIPT_ATTRVAL_ATTRS, | |
_JAVASCRIPT_ATTRVALS) | |
def nltext_attrvals(self): | |
"""Return attribute values for value/etc attributes in tag.""" | |
# We re-use the code in i18nize_templates which already | |
# finds natural language text in tag attributes. | |
return [self.attrs[i][1] | |
for i in i18nize_templates.natural_language_attributes( | |
self.tagname, self.attrs)] | |
def javascript_attrvals(self): | |
"""Return attribute values for data-if/etc attributes in tag.""" | |
return [val for (attr, val) in self.attrs | |
if attr in _JAVASCRIPT_ATTRS] | |
def is_singular(self): | |
"""foo if this tag has an data-if="isSingular(foo)", else None.""" | |
for (attr, val) in self.attrs: | |
if attr == 'data-if': | |
m = self._IS_SINGULAR_RE.match(val) | |
if m: | |
return m.group(1) | |
return None | |
def __init__(self, *args, **kwargs): | |
HTMLParser.HTMLParser.__init__(self, *args, **kwargs) | |
self.tagstack = [] # stack (list) of TagInfo's. | |
self.candidates = [] # tags that we provisionally should emit | |
self.script_nodes = [] # TagInfos for js content in this html | |
self.script_attrvals = [] # (TagInfo, attr-val) where attrval is js | |
self.nltext_attrvals = [] # (TagInfo, attr-val) where attrval is text | |
def _line_offset_to_pos(self, line_and_offset): | |
"""Input is a tuple (5, 10): 10th char of the 5th line.""" | |
return self.linepos[line_and_offset[0]] + line_and_offset[1] | |
# Older versions of HTMLParser (2.7.1, at least) have a bug where | |
# they end <script> tags on *any* </tag>, not just </script>. Fix | |
# that. Later versions of Python fix the bug but add a new arg | |
# to set_cdata_mode(), so we have to handle that too. | |
def set_cdata_mode(self, *args): | |
self.interesting = re.compile(r'</%s' % re.escape(self.lasttag)) | |
# Sadly, there's no regexp pattern that just matches unicode | |
# alphabetic characters, so we do the inverse: non-alnums + digits | |
# = non-alphabetic. | |
_NO_ALPHA_RE = re.compile('^[\W\d]+$', re.UNICODE) | |
def handle_starttag(self, tag, attrs): | |
# Read past the start-tag; startpos is the start of the 'inner' html. | |
startline = self.getpos()[0] | |
outer_startpos = self._line_offset_to_pos(self.getpos()) | |
startpos = outer_startpos + len(self.get_starttag_text()) | |
taginfo = I18nExtractor.TagInfo( | |
tag, attrs, startline, startpos, outer_startpos, | |
self.tagstack[-1] if self.tagstack else None) | |
# Sometimes tags have attributes whose values are natural | |
# language text (e.g. <input value="some text">). If so, | |
# store that fact. | |
for nltext_val in taginfo.nltext_attrvals(): | |
if nltext_val and not self._NO_ALPHA_RE.match(nltext_val): | |
self.nltext_attrvals.append((taginfo, nltext_val)) | |
# Sometimes tags have attributes whose values are javascript | |
# (e.g. <span data-if="some_javascript_code">). If so, store | |
# that fact. | |
for js_val in taginfo.javascript_attrvals(): | |
if js_val: | |
self.script_attrvals.append((taginfo, js_val)) | |
self.tagstack.append(taginfo) | |
def handle_endtag(self, tag): | |
# We need the while because not all tags have end-tags (e.g. <meta>) | |
while self.tagstack and self.tagstack[-1].tagname != tag: | |
self.tagstack.pop() | |
# This can fail if the html is not well-formed (no balanced tags) | |
if not self.tagstack: | |
(linenum, colnum) = self.getpos() | |
raise UnmatchedEndTagError(tag, linenum, colnum) | |
tag_info = self.tagstack.pop() | |
# Update endpos | |
tag_info.endpos = self._line_offset_to_pos(self.getpos()) | |
# outer_endpos points after the end of this tag. HTMLParser | |
# doesn't expose this info, so we depend on the fact end-tags | |
# can't have tag-attrs, so searching for '>' is good enough. | |
tag_info.outer_endpos = self.text.index('>', tag_info.endpos) + 1 | |
# If the tag's innerhtml is javascript, add it to our list of | |
# javascript nodes. Else if tagname-is-good is set, and the | |
# tag contains non-ws text, then its contents are a candidate | |
# to be extracted. However, its parents get dibs: we don't | |
# extract this if we extract a parent. We will have to wait | |
# until the parent is done, to see. | |
if tag_info.is_javascript_tag(): | |
self.script_nodes.append(tag_info) | |
elif tag_info.should_emit_tag and tag_info.tag_has_non_whitespace: | |
self.candidates.append(tag_info) | |
def handle_data(self, data): | |
"""Callback for text between tags.""" | |
if data.strip(): # not just whitespace | |
assert self.tagstack | |
self.tagstack[-1].tag_has_non_whitespace = True | |
def handle_charref(self, charref): | |
"""Callback for data that starts with &, e.g. '.""" | |
assert self.tagstack | |
self.tagstack[-1].tag_has_non_whitespace = True | |
def feed(self, text): | |
"""Store the text so we can print from it, and make line->pos table.""" | |
self.text = text | |
self.linepos = [None, 0] # dummy 0-th line; linenums start at 1 | |
while True: | |
newline = text.find('\n', self.linepos[-1]) | |
if newline == -1: | |
break | |
self.linepos.append(newline + 1) | |
HTMLParser.HTMLParser.feed(self, text) | |
def cleaned_text(self, tag_info): | |
"""Return text between <tag> and </tag>, cleans up whitespaces. | |
Get rid of leading and trailing whitespace, and collapse runs of | |
whitespace to a single whitespace (changing newline to space). | |
Returns: | |
The cleaned text, and the line-number that the cleaned | |
text starts on. | |
""" | |
text = self.text[tag_info.startpos:tag_info.endpos] | |
# Figure out the line number of the first non-whitespace char. | |
line_number = tag_info.startline | |
leading_whitespace = text[:len(text) - len(text.lstrip())] | |
line_number += leading_whitespace.count('\n') | |
# Normalize whitespace and return. | |
return (re.sub(r'\s+', ' ', text).strip(), line_number) | |
def nltext_nodes(self): | |
"""Yields TagInfo objects representing nodes with nl-text in them.""" | |
# If one candidate is inside another one, we print the outside | |
# one. We can figure this out via sorting. | |
self.candidates.sort(key=lambda tag_info: tag_info.startpos) | |
if self.candidates: | |
yield self.candidates[0] | |
parent_range = (self.candidates[0].startpos, | |
self.candidates[0].endpos) | |
for i in xrange(1, len(self.candidates)): | |
# If we're entirely inside our parent, ignore us. | |
if (self.candidates[i].startpos >= parent_range[0] and | |
self.candidates[i].endpos <= parent_range[1]): | |
continue | |
yield self.candidates[i] | |
parent_range = (self.candidates[i].startpos, | |
self.candidates[i].endpos) | |
def javascript_nodes(self): | |
"""Yields TagInfo objects representing nodes with js-text in them.""" | |
for taginfo in self.script_nodes: | |
yield taginfo | |
def javascript_attribute_values(self): | |
"""Yields (TagInfo, attribute_value_string) when attr-value is js.""" | |
for (taginfo, attrval_string) in self.script_attrvals: | |
yield (taginfo, attrval_string) | |
def nltext_attribute_values(self): | |
"""Yields (TagInfo, attribute_value_string) when attr-value is text.""" | |
for (taginfo, attrval_string) in self.nltext_attrvals: | |
yield (taginfo, attrval_string) | |
_JS_GETTEXT_RE = re.compile(r'\$\._|\$\.ngettext') | |
def javascript_has_no_i18n_markup(js_text): | |
"""Return true if js_text does not have $._ or $.ngettext in it. | |
This is an optimization -- if a quick search for $._ or $.ngettext | |
fails, then we know we don't need to do the more expensive | |
javascript tokenization looking for strings to extract. This is | |
because the only time we have strings to extract in js is in $._() | |
and $.ngettext(). | |
Note that this can return False even if there's no actual $._ in | |
the code (for instance, if '$._' appears in a comment). That's ok | |
though; it just means we don't benefit from the optimization for | |
this file. | |
""" | |
return not _JS_GETTEXT_RE.search(js_text) | |
def _extract_javascript(filename, js_text, js_start_line, matches, comments): | |
"""js_start_line: where this javascript starts inside the html file.""" | |
# All i18n markup in javascript is via $._ and $.ngettext, so if | |
# those aren't present, we can bail, confident in the lack of nltext. | |
if javascript_has_no_i18n_markup(js_text): | |
return | |
for (lineno, funcname, message, msg_comments) in ( | |
third_party.babel.messages.extract.extract_javascript( | |
cStringIO.StringIO(js_text.encode('utf-8')), | |
third_party.babel.messages.extract.DEFAULT_KEYWORDS, | |
['I18N:'], | |
{})): | |
# the javascript extractor has a 'feature' where it appends a | |
# third None argument if an ngettext message has interpolated | |
# strings ("%(foo)s"). We ignore that. | |
if isinstance(message, tuple): | |
message = message[:2] | |
matches.setdefault(message, set()).add((filename, | |
js_start_line - 1 + lineno)) | |
comments.setdefault(message, []).extend(msg_comments) | |
def extract_file(filename, matches, comments, contents=None): | |
"""Extract a collection of translatable strings from an HTML file. | |
This function modifies matches and comments in place with new | |
content that it discovers. | |
Arguments: | |
filename: the .html file to extract natural language text from. | |
matches: a dict from found nl-strings to a set of | |
(filename, linenumber) pairs where this string is found. | |
comments: a dict from found nl-strings to a list of | |
comments about those strings (extracted from the source code). | |
contents: if not None, the contents of file 'filename'. | |
""" | |
if contents is None: | |
with open(filename) as f: | |
contents = f.read().decode('utf-8') | |
extractor = I18nExtractor() | |
extractor.feed(contents) | |
singular = None # used when collecting singular + plural for ngettext | |
singular_occ = None | |
for tag_info in extractor.nltext_nodes(): | |
(text, linenum) = extractor.cleaned_text(tag_info) | |
if singular is not None: | |
# If the last tag was the singular part of an ngettext | |
# call, we're the plural. | |
matches.setdefault((singular, text), set()).add(singular_occ) | |
# TODO(csilvers): extract comments from source and append here. | |
comments.setdefault((singular, text), []).extend([]) | |
singular = None | |
singular_occ = None | |
elif tag_info.is_singular(): | |
# If *we're* the singular part of an ngettext call, store | |
# that info so the next tag can add us to matches. | |
singular = text | |
singular_occ = (filename, linenum) | |
else: | |
# Normal, gettext call. | |
matches.setdefault(text, set()).add((filename, linenum)) | |
# TODO(csilvers): extract comments from source and append here. | |
comments.setdefault(text, []).extend([]) | |
# We also need to worry about natural language text inside tags, | |
# such as <input value="natural language text">. | |
for (tag_info, nl_text) in extractor.nltext_attribute_values(): | |
# tag_info.startline may not be exactly right, since the | |
# attribute may not be on the same line as the start of the | |
# tag, but it's hopefully close enough to be useful. | |
matches.setdefault(nl_text, set()).add((filename, tag_info.startline)) | |
# TODO(csilvers): extract comments from source and append here. | |
comments.setdefault(nl_text, []).extend([]) | |
# We also need to extract nl-strings from javascript -- in two | |
# places, text in tags (<script>js</script>) and text in tag | |
# attributes (<span data-if="js">). We call on babel to help with | |
# both. | |
for tag_info in extractor.javascript_nodes(): | |
js_text = extractor.text[tag_info.startpos:tag_info.endpos] | |
# Babel will report line-numbers starting from 1, but really | |
# they should be starting from tag_info.startline. | |
_extract_javascript(filename, js_text, tag_info.startline, | |
matches, comments) | |
for (tag_info, js_text) in extractor.javascript_attribute_values(): | |
# tag_info.startline may not be exactly right, since the | |
# attribute may not be on the same line as the start of the | |
# tag, but it's hopefully close enough to be useful. | |
_extract_javascript(filename, js_text, tag_info.startline, | |
matches, comments) | |
def _filename_to_url(filename): | |
"""Convert an exercise filename into a khan academy url.""" | |
# Get the file name of the exercise, to generate a URL reference | |
basename = os.path.basename(filename) | |
name = os.path.splitext(basename)[0] | |
return _EXERCISE_URL % name | |
class _SetEncoder(json.JSONEncoder): | |
"""Encode set data structures as lists in JSON encoding. | |
From: http://stackoverflow.com/a/8230505/6524 | |
""" | |
def default(self, obj): | |
if isinstance(obj, set): | |
return list(obj) | |
return json.JSONEncoder.default(self, obj) | |
def babel_extract(fileobj, keywords, comment_tags, options): | |
"""Babel extraction method for exercises templates. | |
Arguments: | |
fileobj: the file-like object the messages should be extracted from, | |
in this case a single exercise file. | |
keywords: a list of keywords (i.e. function names) that should be | |
recognized as translation functions. Ignored. | |
comment_tags: a list of translator tags to search for and include | |
in the results. Ignored. | |
options: a dictionary of additional options (optional) | |
Returns: | |
An iterator over (lineno, funcname, message, comments) tuples. | |
""" | |
filename = fileobj.name | |
for (nl_text, comments, occurrences) in extract_files([filename], | |
verbose=False): | |
line_numbers = set(o[1] for o in occurrences) | |
for line_number in line_numbers: | |
if isinstance(nl_text, basestring): | |
yield (line_number, '_', nl_text, comments) | |
else: | |
yield (line_number, 'ngettext', nl_text, comments) | |
if __name__ == '__main__': | |
main() |
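The extractor above can also be driven programmatically through extract_file(). Here is a minimal sketch of that (Python 2), assuming the file lives at the exercises.babel import path it is referenced by later in this gist; the HTML snippet and file name are made up for illustration.

import pprint

import exercises.babel  # assumed import path for the extractor above

matches = {}   # nl-text -> set of (filename, linenum) pairs
comments = {}  # nl-text -> list of translator comments

sample_html = u"""
<div class="exercise">
  <p class="problem">What is <var>A</var> + <var>B</var>?</p>
  <span data-if="isSingular(N)">You have <var>N</var> apple.</span>
  <span data-else>You have <var>N</var> apples.</span>
</div>
"""

exercises.babel.extract_file('sample.html', matches, comments,
                             contents=sample_html)
# matches now maps each extracted string (or a (singular, plural) tuple for
# the ngettext-style data-if="isSingular(...)" pair) to where it was seen.
pprint.pprint(matches)

The next file is the webapp-side counterpart: rather than extracting strings offline, it translates serialized perseus assessment items at request time.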
"""Holds tools needed to translate assessment items.""" | |
import copy | |
import json | |
import re | |
import api.jsonify | |
import assessment_items.models | |
from intl import i18n | |
import intl.regexps | |
import intl.request | |
# Matches a number inside perseus text with thousands group separator and | |
# decimal point. ie 9,100.3 | |
# Inside $...$ though the commas should have surrounding curly braces. ie. | |
# $9{,}100.3$ | |
_PERSEUS_NUMBER_RE = re.compile('\d+((\{?,\}?)\d{3})*\.?\d*', re.UNICODE) | |
# After we detect a perseus number string we need to remove the curly braces | |
# and the comma if we are to cast it to a float | |
_BAD_FLOAT_CHARS_RE = re.compile('[\{\},]', re.UNICODE) | |
# In order to detect whether we are inside or outside of $...$ we need to first | |
# find out how many non-escaped dollar signs come before us. "I owe \$5,000" | |
# would not match anything since the dollar sign is escaped. Theoretically the | |
# slash beforehand could also be escaped, ie. "\\$5{,}000$", in which case the | |
# dollar sign is for real. For completeness we check for that too. | |
_NON_ESCAPED_DOLLAR_SIGN_RE = re.compile(r"(?<!\\)(?:\\\\)*\$", re.UNICODE) | |
def _translate_number(match): | |
"""Translate a number in perseus text to match request language's # format. | |
Different langauges use different decimal and group separators. 1,234.56 | |
in English is translated to 1.234,56 in Spanish. If the number appears | |
within TeX then curly braces are added to surround the comma. | |
This function will preserve the number of decimal places shown in the | |
English version, so 5.00 is translated to 5,00 in Spanish. | |
""" | |
# TODO(james): Merge this with a similar function in download_i18n.py | |
# Get the default number pattern for the current language | |
decimal_format = i18n.request_language_decimal_format() | |
if not decimal_format: | |
# Fake languages (ie. boxes & accents) won't have a decimal format. | |
# So we leave the number untranslated | |
# TODO(james): consider translating these on the fly as well | |
return match.group(0) | |
# We need to remove any curly braces or commas from the match if we are to | |
# be able to cast this number_string to a float. | |
number_string = _BAD_FLOAT_CHARS_RE.sub("", match.group(0)) | |
# Find the precision of the fractional part of the number | |
# ie. for 1.240 -> 3 | |
frac_prec = 0 | |
decimal_pos = number_string.find(".") | |
if decimal_pos >= 0: | |
frac_prec = len(number_string) - (decimal_pos + 1) | |
# Change the decimal format precision (min, max) so they both match | |
# the precision of the fractional part in the English version to ensure we | |
# return a string with the same precision. Since we're changing | |
# global state we make a copy first. | |
decimal_format = copy.copy(decimal_format) | |
decimal_format.frac_prec = (frac_prec, frac_prec) | |
# TODO(james): format_decimal rounds numbers hence "$5.00" -> "$5" | |
# Figure out how to not get it to round, or to add significant figures | |
# back in afterwards | |
translated_number = unicode(i18n.format_decimal(float(number_string), | |
decimal_format)) | |
# We want to add {} around the number if it is inside $...$ but not if it is | |
# outside. Unlike download_i18n.py's automatic number translation, where we | |
# fully parse the text, here we have no alpha characters, so we can assume | |
# we are inside $...$ if there is an odd number of non-escaped $s before | |
# the match. | |
num_dollar_signs = len(_NON_ESCAPED_DOLLAR_SIGN_RE.findall( | |
match.string[:match.start()])) | |
if num_dollar_signs % 2 == 1: | |
return translated_number.replace(",", "{,}") | |
return translated_number | |
def translate_serialized_assessment_item(item): | |
mo_locale = intl.request.locale_for_mo() | |
if mo_locale == "en": | |
# If we are in English don't bother translating as this was the | |
# language the content was written in. | |
return item | |
# keep track of which parts have been translated | |
translatable_parts = [] | |
# crowdin's jipt (just in place translation) locale used on | |
# translate.khanacademy.org | |
jipt_locale = intl.request.jipt_locale_for_mo() | |
def _maybe_translate(part, attr_or_index): | |
"""Translate in place all natural language text. | |
This replaces in memory the value of part[attr_or_index], where part is | |
either a dict (attr_or_index is a key) or a list (attr_or_index is an | |
index), with its translation. | |
""" | |
text = part[attr_or_index] | |
if text and not intl.regexps.NO_NEED_TO_TRANSLATE.match(text): | |
part[attr_or_index] = i18n._(text) # @Nolint - it is ok to be | |
# translating a variable here. | |
if jipt_locale and "crwdns" not in part[attr_or_index]: | |
# We are on translate.ka.org but the string doesn't seem to | |
# have the jipt tags, so we add a warning message to the user, | |
# and we don't count this as a translatable part. | |
part[attr_or_index] = ("$\\large{\\red{\\text{The following " | |
"content is not yet on crowdin, check " | |
"back in a week}}}$\n\n%s" % | |
part[attr_or_index]) | |
else: | |
translatable_parts.append(text) | |
elif text: | |
# No alpha characters, but let's see if it has any numbers in it | |
# that we could automatically translate. | |
part[attr_or_index] = _PERSEUS_NUMBER_RE.sub(_translate_number, | |
text) | |
item_data = json.loads(item["item_data"]) | |
assessment_items.models.AssessmentItem.traverse_natural_language_parts( | |
item_data, _maybe_translate) | |
# If we are on translate.khanacademy.org we want to calculate the | |
# percentage based upon crowdin's jipt (just-in-place translation) locale. | |
check_translated_locale = jipt_locale if jipt_locale else mo_locale | |
if translatable_parts: | |
parts_translated = sum( | |
[i18n.has_translation(check_translated_locale, text) | |
for text in translatable_parts]) | |
item["percent_translated"] = ( | |
parts_translated / float(len(translatable_parts))) * 100 | |
else: | |
# If there is nothing to translate in this part we assume it is 100% | |
# translated | |
item["percent_translated"] = 100.0 | |
item["item_data"] = json.dumps(item_data) | |
return item | |
def is_fully_translated(assessment_item): | |
"""Determine if an assessment item is fully translated in request locale""" | |
serialized_item = api.jsonify.as_serializable(assessment_item) | |
translated_item = translate_serialized_assessment_item(serialized_item) | |
return translated_item.get("percent_translated", 100.0) == 100.0 |
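The number handling above defers to i18n.format_decimal and the request locale. As a self-contained sketch of the same idea (Python 2), here is a stand-in that hard-codes a Spanish-style separator swap instead of using KA's i18n module; the regexes are copied from the file above.

import re

_PERSEUS_NUMBER_RE = re.compile(r'\d+((\{?,\}?)\d{3})*\.?\d*', re.UNICODE)
_BAD_FLOAT_CHARS_RE = re.compile(r'[{},]', re.UNICODE)
_NON_ESCAPED_DOLLAR_SIGN_RE = re.compile(r'(?<!\\)(?:\\\\)*\$', re.UNICODE)


def translate_numbers(text, decimal_sep=',', group_sep='.'):
    """Rewrite numbers in text with the given separators (Spanish defaults).

    The string formatting here stands in for i18n.format_decimal in the
    real code; the precision of the English number is preserved.
    """
    def _replace(match):
        number_string = _BAD_FLOAT_CHARS_RE.sub('', match.group(0))
        frac_prec = 0
        decimal_pos = number_string.find('.')
        if decimal_pos >= 0:
            frac_prec = len(number_string) - (decimal_pos + 1)
        # Format with a thousands separator and the English precision,
        # then swap the separators for the target language's.
        formatted = '{:,.{prec}f}'.format(float(number_string),
                                          prec=frac_prec)
        formatted = (formatted.replace(',', '\0')
                              .replace('.', decimal_sep)
                              .replace('\0', group_sep))
        # If we sit inside $...$ (an odd number of unescaped $s before the
        # match), protect the group separator with braces for TeX.
        n_dollars = len(_NON_ESCAPED_DOLLAR_SIGN_RE.findall(
            match.string[:match.start()]))
        if n_dollars % 2 == 1:
            formatted = formatted.replace(group_sep, '{%s}' % group_sep)
        return formatted

    return _PERSEUS_NUMBER_RE.sub(_replace, text)


print translate_numbers(u'Pay $9{,}100.30$ now, or 1,234.5 later.')
# -> Pay $9{.}100,30$ now, or 1.234,5 later.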
"""Holds AssessmentItem and related entities. | |
AssessmentItem: database entity about a single assessment item that can be | |
included in exercises, and other things in the future | |
AssessmentItemRevision: database entity about a single version of an assessment | |
item, of which the revision is a child | |
""" | |
import os | |
from google.appengine.ext import db | |
from google.appengine.ext import ndb | |
import api.jsonify | |
import backup_model | |
import compat_key | |
import content | |
import datetime | |
import db_util | |
import layer_cache | |
import setting_model | |
@db_util.disable_ndb_memcache | |
class AssessmentItemTag(backup_model.BackupModelNDB): | |
_serialize_whitelist = ['id', 'description', 'display_name'] | |
description = ndb.StringProperty() | |
display_name = ndb.StringProperty() | |
deleted = ndb.BooleanProperty(indexed=True, default=False) | |
@property | |
def id(self): | |
return self.key.urlsafe() | |
def validate(self): | |
# TODO(cbhl): Prevent duplicate tags from being created. | |
pass | |
@staticmethod | |
def create(**kwargs): | |
"""Create and return a new tag, without putting it to the datastore.""" | |
# Keep all the tags in the same entity group to get strong consistency | |
item = AssessmentItemTag(parent=ndb.Key(AssessmentItemTag, '0'), | |
**kwargs) | |
item.validate() | |
return item | |
@staticmethod | |
def query_all(): | |
return AssessmentItemTag.query(AssessmentItemTag.deleted == False, | |
ancestor=ndb.Key(AssessmentItemTag, '0')) | |
@staticmethod | |
@layer_cache.cache_with_key_params_fxn( | |
lambda: setting_model.Setting.cached_assessment_item_tags_date, | |
cache_separately_per_display_language=False) | |
def fetch_all(): | |
return AssessmentItemTag.query_all().fetch() | |
@staticmethod | |
def find_by_display_name(display_name): | |
# TODO(alpert): Filter by deleted here? | |
return AssessmentItemTag.query( | |
AssessmentItemTag.display_name == display_name, | |
ancestor=ndb.Key(AssessmentItemTag, '0')).get() | |
class BaseAssessmentItem(object): | |
content_kind = "AssessmentItem" | |
content_kind_code = "i" | |
item_data = ndb.StringProperty(indexed=False) | |
author_names = ndb.StringProperty(indexed=False, repeated=True) | |
created_by = ndb.KeyProperty(indexed=True) | |
tags = ndb.KeyProperty(indexed=True, repeated=True, kind=AssessmentItemTag) | |
# Name used to refer to this item in the perseus interface | |
name = ndb.StringProperty(indexed=False) | |
@staticmethod | |
def random_id(): | |
"""Generate a random unique identifier for a future new entity. | |
We override the implementation from BaseRevision because we expect to | |
have many thousands of items and so the probability of a collision with | |
32-bit keys is significant.""" | |
return 'x' + os.urandom(8).encode('hex') | |
@staticmethod | |
def traverse_item_data(item_data, handle_content, handle_widget): | |
# We parse item_data using the schema at: | |
# https://gist.github.com/alopatin/96d923d57bcfb7e8bf4e | |
# If you change this code make sure to change the corresponding | |
# translating code in api/v1_assessment_items.py | |
handle_content(item_data.get("question", {})) | |
for hint in item_data.get("hints", []): | |
handle_content(hint) | |
answer = item_data.get("answerArea", {}) | |
if answer.get("type") == "multiple": | |
handle_content(answer.get("options", {})) | |
else: | |
handle_widget(answer) | |
@staticmethod | |
def traverse_natural_language_parts(item_data, handler): | |
"""Iterate over all the natural language parts in item_data | |
This will call handler(part, attr_or_index), where the part will either | |
be a dict and the second arg the key to the natural language within | |
that dict, or part will be a list and second arg the index to the | |
natural language part within the list. This allows the handler | |
function to change part[attr_or_index], modifying its value in memory. | |
We parse the item_data using the schema at: | |
https://gist.github.com/alopatin/96d923d57bcfb7e8bf4e | |
If we change that schema, by say adding a new widget, we will need to | |
update this function. | |
""" | |
def handle_list_items(dict_part, attr): | |
part = dict_part.get(attr, []) | |
for index in xrange(0, len(part)): | |
handler(part, index) | |
def handle_content_in_list_items(dict_part, attr): | |
parts = dict_part.get(attr, []) | |
for part in parts: | |
if part.get("content"): | |
handler(part, "content") | |
def handle_widget(widget): | |
options = widget.get("options", {}) | |
if widget.get("type") in ["radio", "dropdown"]: | |
handle_content_in_list_items(options, "choices") | |
elif widget.get("type") == "categorization": | |
handle_content_in_list_items(options, "items") | |
handle_list_items(options, "categoryHeaders") | |
elif widget.get("type") == "plotter": | |
handle_list_items(options, "labels") | |
handle_list_items(options, "categories") | |
elif widget.get("type") == "orderer": | |
handle_content_in_list_items(options, "options") | |
handle_content_in_list_items(options, "correctOptions") | |
handle_content_in_list_items(options, "otherOptions") | |
elif widget.get("type") == "matcher": | |
handle_list_items(options, "labels") | |
handle_list_items(options, "left") | |
handle_list_items(options, "right") | |
elif widget.get("type") == "sorter": | |
handle_list_items(options, "correct") | |
elif widget.get("type") == "image": | |
handle_content_in_list_items(options, "labels") | |
def handle_renderer(renderer): | |
if renderer.get("content"): | |
handler(renderer, "content") | |
for widget in renderer.get("widgets", {}).itervalues(): | |
handle_widget(widget) | |
AssessmentItem.traverse_item_data( | |
item_data, | |
lambda x: handle_renderer(x), | |
lambda x: handle_widget(x)) | |
def preview_relative_url(self): | |
return "/preview/content/items/%s" % self.key.id() | |
def preview_relative_url_in_exercise(self, exercise): | |
"""Return a url that contains all items in exercise jumping to this one | |
This presumes that the item is in the exercise passed in. If it is not | |
that it will just go to the first assessment item in that exercise. | |
""" | |
return "/preview/content/items?exercises=%s#%s" % (exercise.name, | |
self.key.id()) | |
@db_util.disable_ndb_memcache | |
class AssessmentItem(BaseAssessmentItem, | |
content.models.VersionedContentNDB): | |
"""Information about a single assessment item.""" | |
# Metaclass is needed to allow inheriting DB properties from | |
# BaseAssessmentItem | |
__metaclass__ = db_util.NDBInheritModelPropertiesType | |
_serialize_whitelist = [ | |
'id', 'creation_date', 'item_data', 'author_names', | |
'tags', 'sha', 'name'] | |
edit_revision = ndb.KeyProperty(kind='AssessmentItemRevision', | |
indexed=False) | |
is_dirty = ndb.ComputedProperty(lambda self: | |
self.edit_revision and self.edit_revision.id() != self.sha) | |
def get_edit_revision(self): | |
return self.get_edit_revision_async().get_result() | |
def get_edit_revision_async(self): | |
if self.edit_revision: | |
return self.edit_revision.get_async() | |
else: | |
return self.get_current_revision_async() | |
def get_revisions(self): | |
return (AssessmentItemRevision.query() | |
.filter(AssessmentItemRevision.content_id == self.id).fetch()) | |
@staticmethod | |
def create_sorted_revision_list(revisions): | |
def date_sort(revision): | |
return revision.creation_date or datetime.datetime.min | |
sorted_revisions = sorted(revisions, key=date_sort, reverse=True) | |
# Fetch revision creators | |
created_by_keys = set() | |
for revision in revisions: | |
if revision.created_by: | |
created_by_keys.add(compat_key.from_(revision.created_by).db) | |
created_by = {user.key(): user for user in | |
db.get(list(created_by_keys))} | |
def get_created_by(revision): | |
if revision.created_by: | |
creator = created_by[compat_key.from_(revision.created_by).db] | |
return creator.nickname | |
else: | |
return "" | |
def get_creation_date(revision): | |
if revision.creation_date: | |
return revision.creation_date.date().isoformat() | |
else: | |
return "" | |
return [{ | |
"created_by": get_created_by(revision), | |
"sha": revision.sha, | |
"creation_date": get_creation_date(revision), | |
} for revision in sorted_revisions] | |
@db_util.disable_ndb_memcache | |
class AssessmentItemRevision(BaseAssessmentItem, | |
content.models.BaseRevisionNDB): | |
"""Information about a single version of an assessment item.""" | |
# Metaclass is needed to allow inheriting DB properties from | |
# BaseAssessmentItem | |
__metaclass__ = db_util.NDBInheritModelPropertiesType | |
@classmethod | |
def fixup_after_deserialize(cls, properties): | |
super(AssessmentItemRevision, cls).fixup_after_deserialize(properties) | |
# Allow item_data to be passed up as a JSON object or string | |
# TODO(joel) - potentially remove this in the future | |
if ("item_data" in properties and | |
type(properties["item_data"]) == dict): | |
properties["item_data"] = api.jsonify.jsonify( | |
properties["item_data"]) | |
# Convert string keys to actual keys | |
if "tags" in properties: | |
if properties["tags"] is not None: | |
properties["tags"] = [compat_key.from_(key).ndb for key in | |
properties["tags"]] | |
else: | |
properties["tags"] = [] | |
# Because AssessmentItems are never published, getting | |
# AssessmentItemRevisions should be made convenient. | |
@staticmethod | |
def get_by_sha(sha): | |
return content.models.BaseRevisionNDB.get_by_sha("AssessmentItem", sha) | |
@staticmethod | |
def get_by_sha_list(sha_list): | |
key_list = [ndb.Key("AssessmentItemRevision", sha) for sha in sha_list] | |
return compat_key.maybe_get_multi(key_list) |
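A small sketch of how traverse_natural_language_parts is meant to be driven (Python 2): the handler mutates part[attr_or_index] in place, which is exactly what translate_serialized_assessment_item does earlier in this gist. The item_data literal is a made-up example following the schema the traversal walks, and the import path is assumed.

import json

import assessment_items.models  # assumed import path for the models above

item_data = {
    "question": {
        "content": "What color is the sky?",
        "widgets": {
            "radio 1": {
                "type": "radio",
                "options": {"choices": [{"content": "Blue"},
                                        {"content": "Green"}]},
            },
        },
    },
    "hints": [{"content": "Look up."}],
    "answerArea": {"type": "radio", "options": {"choices": []}},
}


def shout(part, attr_or_index):
    """Handler that mutates each natural-language part in place."""
    part[attr_or_index] = part[attr_or_index].upper()

assessment_items.models.AssessmentItem.traverse_natural_language_parts(
    item_data, shout)
print json.dumps(item_data, indent=2)
# The question content, both radio choices and the hint are now upper-cased.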
"""A Compile object (see compile_rule.py): translates khan-exercises files.""" | |
import re | |
from kake import translate_javascript | |
from kake import translate_util | |
from kake.lib import compile_rule | |
class TranslateExercises(translate_util.TranslateBase): | |
"""Class for translating natural-language text in khan-exercises files.""" | |
_DATA_IF_RE = re.compile(r"data-if\s*=\s*('[^']*'|\"[^\"]*\"|[^'\">\s]+)") | |
def version(self): | |
"""Update every time build() changes in a way that affects output.""" | |
return 1 | |
def _ngettext_html(self, text, tag_info, lang, translation): | |
"""Creates 'translated' html for an ngettext-like span. | |
In exercises html, ngettext is implemented like so: | |
<span data-if="isSingular(KEY)" foo=bar>xxx</span> | |
<span data-else foo=bar>yyy</span> | |
When translating into Polish, with 4 plural forms, we want to emit | |
<span data-if="$.ngetpos(KEY, LANG) === 0" foo=bar>aaa</span> | |
<span data-else-if="$.ngetpos(KEY, LANG) === 1" foo=bar>bbb</span> | |
<span data-else-if="$.ngetpos(KEY, LANG) === 2" foo=bar>ccc</span> | |
<span data-else-if="$.ngetpos(KEY, LANG) === 3" foo=bar>ddd</span> | |
This routine returns a string consisting of the desired output. | |
""" | |
old_starttag = text[tag_info.outer_startpos:tag_info.startpos] | |
old_endtag = text[tag_info.endpos:tag_info.outer_endpos] | |
ngetpos_key = tag_info.is_singular() | |
data_if = self._DATA_IF_RE.search(old_starttag) | |
assert ngetpos_key is not None, tag_info | |
assert data_if, old_starttag | |
assert translate_util.SmallMOFile.is_plural(translation), translation | |
new_text = [] | |
for i in xrange(len(translation)): | |
if i == 0: | |
new_data_if = ('data-if="$.ngetpos(%s, \'%s\') === 0"' | |
% (ngetpos_key, lang)) | |
else: | |
new_data_if = ('data-else-if="$.ngetpos(%s, \'%s\') === %d"' | |
% (ngetpos_key, lang, i)) | |
new_text.extend((old_starttag[:data_if.start()], | |
new_data_if, | |
old_starttag[data_if.end():], | |
translation[i], | |
old_endtag)) | |
return ''.join(new_text) | |
def translate(self, infile_name, outfile_lang_moentries_context): | |
# Import here so kake users who don't actually need the | |
# translate_exercises rule can still import this module | |
# without needing to bring in exercises. | |
import exercises.babel | |
file_contents = self._read_input(infile_name) | |
# Get all the info about the scripts and nltext in this document. | |
extractor = exercises.babel.I18nExtractor() | |
extractor.feed(file_contents) | |
nltext_info = list(extractor.nltext_nodes()) | |
nltext_in_attr_info = list(extractor.nltext_attribute_values()) | |
script_info = list(extractor.javascript_nodes()) | |
script_in_attr_info = list(extractor.javascript_attribute_values()) | |
js_translator = translate_javascript.TranslateJavascript() | |
for (outfile, lang, mo_entries, _) in outfile_lang_moentries_context: | |
# Keep track of the rewrites we're supposed to do. | |
rewrites = [] # each entry is (startpos, endpos, new_text) | |
# Handle javascript in, e.g., "<script>js</script>" | |
for tag_info in script_info: | |
# Use a javascript-injector to handle the javascript | |
# inside the exercises. | |
script_contents = file_contents[tag_info.startpos: | |
tag_info.endpos] | |
babel_output = js_translator.extract_nltext(script_contents) | |
translated_js = js_translator.translate_to_lang( | |
babel_output, script_contents, lang, mo_entries) | |
# Only inject the string if there is a translation | |
if translated_js is not None: | |
rewrites.append((tag_info.startpos, tag_info.endpos, | |
translated_js)) | |
# Handle javascript in, e.g., "<span data-if="js">...</span>" | |
for (tag_info, attrval_string) in script_in_attr_info: | |
# tag_info is the tag-info for the span, attrval_string is js. | |
babel_output = js_translator.extract_nltext(attrval_string) | |
translated_js = js_translator.translate_to_lang( | |
babel_output, attrval_string, lang, mo_entries) | |
if translated_js is not None: | |
# We need to figure out where attrval_string is inside | |
# tag_info. TODO(csilvers): have attrval_pos passed in. | |
attrval_pos = file_contents.index(attrval_string, | |
tag_info.outer_startpos, | |
tag_info.startpos) | |
rewrites.append((attrval_pos, | |
attrval_pos + len(attrval_string), | |
translated_js)) | |
# Handle text. | |
last_was_singular = False # used when we're injecting ngettext | |
for tag_info in nltext_info: | |
if last_was_singular: | |
# For ngettext, we want to replace the English | |
# singular + plural spans with a bunch of | |
# translated spans (one for each plural). We | |
# already deleted the English singular and | |
# inserted the new stuff, so all that's left is to | |
# delete the English plural. | |
rewrites.append((tag_info.outer_startpos, | |
tag_info.outer_endpos, | |
'')) | |
last_was_singular = False | |
continue | |
(cleaned_contents, _) = extractor.cleaned_text(tag_info) | |
if tag_info.is_singular(): | |
# An ngettext call. We replace the English singular | |
# with a bunch of translated spans (one per plural). | |
translation = mo_entries.get_plural_translation( | |
cleaned_contents) | |
if translation: # a translation was found | |
new_text = self._ngettext_html(file_contents, tag_info, | |
lang, translation) | |
rewrites.append((tag_info.outer_startpos, | |
tag_info.outer_endpos, | |
new_text)) | |
last_was_singular = True | |
else: | |
# A gettext call. We can replace the text between | |
# the tags with translated text. | |
translation = mo_entries.get_singular_translation( | |
cleaned_contents) | |
if translation: | |
rewrites.append((tag_info.startpos, tag_info.endpos, | |
translation)) | |
# Handle text in, e.g., "<input value="text">" | |
for (tag_info, attrval_string) in nltext_in_attr_info: | |
# tag_info is the tag-info for the input, attrval is text. | |
translation = mo_entries.get_singular_translation( | |
attrval_string) | |
if translation: | |
# We need to figure out where attrval_string is inside | |
# tag_info. TODO(csilvers): have attrval_pos passed in. | |
attrval_pos = file_contents.index(attrval_string, | |
tag_info.outer_startpos, | |
tag_info.startpos) | |
rewrites.append((attrval_pos, | |
attrval_pos + len(attrval_string), | |
translation)) | |
# Small optimization to catch a case where we know the output is unchanged. | |
if not rewrites: | |
translated_contents = None | |
else: | |
# Construct the new file from the original file + rewrites. | |
translated_contents = [] | |
rewrites.sort() # get in startpos order | |
i = 0 | |
for (startpos, endpos, new_text) in rewrites: | |
translated_contents.append(file_contents[i:startpos]) | |
translated_contents.append(new_text) | |
i = endpos | |
translated_contents.append(file_contents[i:]) | |
translated_contents = ''.join(translated_contents) | |
if translated_contents == file_contents: | |
translated_contents = None # signal that output == input | |
self._write_output(infile_name, outfile, translated_contents) | |
compile_rule.register_compile( | |
'TRANSLATED KA-EXERCISE', | |
'genfiles/translations/{lang}/khan-exercises/{{path}}.html', | |
['khan-exercises/{{path}}.html', | |
('genfiles/extracted_strings/{lang}/' | |
'khan-exercises/{{path}}.html.small_mo.pickle')], | |
TranslateExercises(), | |
# small_mo.pickle files are recreated every time {lang}.po files | |
# change, but their contents usually don't change, so crc's are | |
# good for us. | |
compute_crc=True) |
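The translate() method above accumulates (startpos, endpos, new_text) rewrites against the original file and then stitches the output together in one pass. The splicing step in isolation looks like this (standalone Python 2 sketch; the toy string and offsets are made up):

def apply_rewrites(text, rewrites):
    """Apply non-overlapping (startpos, endpos, new_text) rewrites to text."""
    out = []
    i = 0
    for (startpos, endpos, new_text) in sorted(rewrites):
        out.append(text[i:startpos])   # unchanged text before this rewrite
        out.append(new_text)           # the replacement
        i = endpos                     # resume after the replaced span
    out.append(text[i:])               # trailing unchanged text
    return ''.join(out)


html = '<p>one</p><p>two</p>'
rewrites = [(13, 16, 'dos'), (3, 6, 'uno')]   # offsets of "two" and "one"
print apply_rewrites(html, rewrites)          # <p>uno</p><p>dos</p>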
"""A Compile object (see compile_rule.py): translates .js files.""" | |
import cStringIO | |
import json | |
import re | |
import third_party.babel.messages.extract | |
import intl.data | |
import intl.english_only | |
from kake import translate_util | |
from kake.lib import compile_rule | |
class TranslateJavascript(translate_util.TranslateBase): | |
"""Class for translating natural-language text in javascript files.""" | |
_JS_GETTEXT_RE = re.compile(r'\$\._|\$\.ngettext|\$_') | |
_BABEL_KW = third_party.babel.messages.extract.DEFAULT_KEYWORDS.copy() | |
# <$_> in jsx expands to $_({varmap}, "string", ...), so kw-index is 2. | |
_BABEL_KW['$_'] = (2,) # used in .jsx files as <$_> | |
def version(self): | |
"""Update every time build() changes in a way that affects output.""" | |
return 1 | |
def extract_nltext(self, file_contents): | |
# Extract the messages from the JavaScript file with the | |
# appropriate start and end positions as well. As a small | |
# efficiency short-cut, we can say that if none of ($._, | |
# $.ngettext, $_) are present in file_contents, there won't be | |
# any translations to do. | |
if not self._JS_GETTEXT_RE.search(file_contents): | |
return [] | |
else: | |
# Convert it to a StringIO buffer for pybabel to handle. | |
# pybabel expects utf-8 encoded input, so that's what we give it. | |
input = cStringIO.StringIO(file_contents.encode('utf-8')) | |
r = third_party.babel.messages.extract.extract_javascript( | |
input, self._BABEL_KW, [], {'messages_only': True}) | |
return list(r) # convert from iterator if need be | |
def translate_to_lang(self, babel_output, file_contents, lang, mo_entries): | |
# Keep track of whether the translated file differs from the original. | |
has_diff = False | |
# Go through all of the matched messages in reverse (to avoid | |
# having to deal with the changes in position of the messages). | |
for (messages, start, end) in reversed(babel_output): | |
# Figure out the lookup key. extract_javascript returns a | |
# list if a plural is found. | |
key = messages | |
if isinstance(key, basestring): # singular | |
key = messages | |
translation = mo_entries.get_singular_translation(key) | |
if not translation: | |
continue | |
insert_text = json.dumps(translation) | |
elif messages[0] is None: # jsx-style $_() | |
# For the jsx $_() operator, the gettext string is the | |
# *second* argument (the first argument is the value-dict). | |
key = messages[1] | |
translation = mo_entries.get_singular_translation(key) | |
if not translation: | |
continue | |
insert_text = json.dumps(translation) | |
else: # plural | |
key = messages[0] | |
translation_dict = mo_entries.get_plural_translation(key) | |
if not translation_dict: | |
continue | |
# We need the messages to be sorted by index. | |
index_and_messages = sorted(translation_dict.iteritems()) | |
messages = [message for (_, message) in index_and_messages] | |
# We store the info we need in legal javascript format. | |
insert_text = json.dumps({"lang": lang, "messages": messages}) | |
# Insert the string at the right position. | |
file_contents = ''.join((file_contents[:start], | |
insert_text, | |
file_contents[end:])) | |
# A change was made to the file. | |
has_diff = True | |
if has_diff: | |
return file_contents | |
else: | |
return None # signals that output == input | |
def translate(self, infile_name, outfile_lang_moentries_context): | |
if intl.english_only.should_not_translate(infile_name): | |
# If we shouldn't translate it, we can just symlink it! | |
for (outfile, _, _, _) in outfile_lang_moentries_context: | |
self._write_output(infile_name, outfile, None) | |
return | |
file_contents = self._read_input(infile_name) | |
babel_output = self.extract_nltext(file_contents) | |
for (outfile, lang, mo_entries, _) in outfile_lang_moentries_context: | |
# Get the translation, or None if output == input. | |
translated_contents = self.translate_to_lang( | |
babel_output, file_contents, lang, mo_entries) | |
self._write_output(infile_name, outfile, translated_contents) | |
# These rules are only used in dev (where we don't compress js), and | |
# for js worker files, which we likewise translate without compressing. | |
compile_rule.register_compile( | |
'TRANSLATED RAW JS FILES', | |
'genfiles/translations/{lang}/{{path}}.js', | |
['{{path}}.js', | |
'genfiles/extracted_strings/{lang}/{{path}}.js.small_mo.pickle'], | |
TranslateJavascript(), | |
# small_mo.pickle files are recreated every time {lang}.po files | |
# change, but their contents usually don't change, so crc's are | |
# good for us. | |
compute_crc=True) | |
# This catches files that are compiled (or transpiled) into js. | |
dirs_with_js = ('genfiles/compiled_{type}', | |
# This is a special case (bundle.js has its own directory) | |
'genfiles/khan-exercises', | |
) | |
for d in dirs_with_js: | |
translate_util.register_translatesafe_compile( | |
'TRANSLATED JS FILES (%s)' % d, | |
'%s/{lang}/{{path}}.js' % d, | |
['%s/en/{{path}}.js' % d, | |
('genfiles/extracted_strings/{lang}/' | |
'%s/{lang}/{{path}}.js.small_mo.pickle' % d)], | |
TranslateJavascript(), | |
compute_crc=True) | |
# This is the rule used in prod, where we only translate javascript | |
# after it's been compressed. The exception is handlebars files, | |
# which are translated before they're even converted to javascript, in | |
# compile_handlebars.py. Luckily, the special-case rule for the | |
# handlebars files (in compress_js.py) has higher precedence than this | |
# rule, so we can be fully general here. | |
# We use 'trumped_by' to make sure this rule doesn't apply when lang=en, | |
# and also to make sure this rule doesn't apply when translating handlebars. | |
translate_util.register_translatesafe_compile( | |
'TRANSLATED COMPRESSED JS FILES', | |
'genfiles/compressed_javascript/{lang}/{{path}}.min.js', | |
['genfiles/compressed_javascript/en/{{path}}.min.js', | |
('genfiles/extracted_strings/{lang}/genfiles/compressed_javascript/{lang}' | |
'/{{path}}.min.js.small_mo.pickle')], | |
TranslateJavascript(), | |
trumped_by=['COMPRESSED JS', 'COMPRESSED TRANSLATED HANDLEBARS JS'], | |
compute_crc=True, | |
) |
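translate_to_lang above iterates babel's matches in reverse so that earlier (start, end) offsets stay valid while later spans are replaced. The same idea in isolation (Python 2; the offsets are hand-computed stand-ins for real babel output):

import json

js = 'alert($._("Hello")); alert($._("Bye"));'

# (message, start, end) tuples covering the string literals, in the order
# babel would report them; the offsets here are hand-computed.
matches = [('Hello', 10, 17), ('Bye', 31, 36)]
translations = {'Hello': u'Hola', 'Bye': u'Adios'}

for (message, start, end) in reversed(matches):
    translation = translations.get(message)
    if not translation:
        continue
    # Replacing from the end backwards leaves earlier offsets untouched.
    js = js[:start] + json.dumps(translation) + js[end:]

print js   # alert($._("Hola")); alert($._("Adios"));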