# youtube-dl-gui/devscripts/check-translation.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Author: Sotiris Papadopoulos <ytubedlg@gmail.com>
Last-Revision: 2017-04-19

Script to automatically check PO files

"""

from __future__ import unicode_literals

import os
import sys
import logging
import argparse

from time import sleep
from datetime import datetime, timedelta, tzinfo

try:
    import polib
    import google_translate
except ImportError as error:
    print(error)
    sys.exit(1)


WTIME = 2.0  # Time in seconds to wait between requests to avoid ban

PACKAGE = "youtube_dl_gui"

PO_FILENAME = "{}.po".format(PACKAGE)

LOCALE_PATH_TMPL = os.path.join(PACKAGE, "locale", "{lang}", "LC_MESSAGES", PO_FILENAME)


logging.basicConfig(level=logging.ERROR)


def parse():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description="Script to automatically check PO files")

    parser.add_argument("language", help="language of the PO file to check")

    parser.add_argument("-w", "--werror", action="store_true", help="treat all warning messages as errors")
    parser.add_argument("-o", "--only-headers", action="store_true", help="check only the PO file headers")
    parser.add_argument("-n", "--no-translate", action="store_true", help="do not use the translator to check 'msgstr' fields")
    parser.add_argument("-t", "--tlang", help="force a different language on the translator than the one given")

    return parser.parse_args()


class UTC_Offset_Timezone(tzinfo):

    """Class that represents a UTC offset in the format +/-0000."""

    def __init__(self, offset_string):
        self.offset = timedelta(seconds=UTC_Offset_Timezone.parse_offset(offset_string))

    def utcoffset(self, dt):
        return self.offset + self.dst(dt)

    def dst(self, dt):
        return timedelta(0)

    @staticmethod
    def parse_offset(offset_string):
        """Parse the offset string into seconds."""

        if len(offset_string) != 5:
            raise ValueError("Invalid length for offset string ({})".format(offset_string))

        hours = offset_string[1:3]
        minutes = offset_string[3:5]

        offset = int(hours) * 3600 + int(minutes) * 60

        if offset_string[0] == "-":
            return -1 * offset

        return offset


def parse_date(date_string):
    """Parse date string into an aware datetime object."""

    # Just a small list with the most common timezones
    offset_list = [
        ("EEST", "0300"),
        ("EET", "0200"),
        ("GMT", "0000"),
        ("UTC", "0000")
    ]

    # Replace all the timezones with the offset
    for item in offset_list:
        timezone, offset = item

        date_string = date_string.replace(timezone, offset)

    datetime_string = date_string[:16]
    offset_string = date_string[16:]

    naive_date = datetime.strptime(datetime_string, "%Y-%m-%d %H:%M")

    # Create & return an aware datetime object based on the offset
    return naive_date.replace(tzinfo=UTC_Offset_Timezone(offset_string))

# Print helpers

def my_print(msg, char="*", value=None, exit=False):
    """Print 'msg', debug 'value' and exit if 'exit' is True."""
    print("[{}] {}".format(char, msg))

    if value is not None:
        print("\tvalue= \"{}\"".format(value))

    if exit:
        sys.exit(1)

def perror(msg, value=None):
    my_print(msg, "-", value, True)

def pwarn(msg, value=None, exit=False):
    my_print(msg, "!", value, exit)

def pinfo(msg):
    my_print(msg)

#############################


def main(args):
    os.chdir("..")

    # setup
    pot_file_path = LOCALE_PATH_TMPL.format(lang="en_US")
    po_file_path = LOCALE_PATH_TMPL.format(lang=args.language)

    if not os.path.exists(pot_file_path):
        perror("Failed to locate POT file, exiting...", pot_file_path)

    if not os.path.exists(po_file_path):
        perror("Failed to locate PO file, exiting...", po_file_path)

    pot_file = polib.pofile(pot_file_path)
    po_file = polib.pofile(po_file_path)

    # check headers
    pinfo("Checking PO headers")

    pot_headers = pot_file.metadata
    po_headers = po_file.metadata

    if pot_headers["Project-Id-Version"] != po_headers["Project-Id-Version"]:
        pwarn("'Project-Id-Version' headers do not match", exit=args.werror)

    if pot_headers["POT-Creation-Date"] != po_headers["POT-Creation-Date"]:
        pwarn("'POT-Creation-Date' headers do not match", exit=args.werror)

    po_creation_date = parse_date(po_headers["POT-Creation-Date"])
    po_revision_date = parse_date(po_headers["PO-Revision-Date"])

    # Aware datetimes convert to UTC automatically when comparing
    if po_revision_date <= po_creation_date:
        pwarn("PO file seems outdated", exit=args.werror)

    if "Language" in po_headers and po_headers["Language"] != args.language:
        pwarn("'Language' header does not match with the given language", po_headers["Language"], args.werror)

    pinfo("Last-Translator: {}".format(po_headers["Last-Translator"]))

    # check translations
    if args.only_headers:
        sys.exit(0)

    pinfo("Checking translations, this might take a while...")

    pot_msgid = [entry.msgid for entry in pot_file]
    po_msgid = [entry.msgid for entry in po_file]

    # lists to hold reports
    missing_msgid = []
    not_translated = []
    same_msgstr = []
    with_typo = []
    verify_trans = []
    fuzzy_trans = po_file.fuzzy_entries()

    for msgid in pot_msgid:
        if msgid not in po_msgid:
            missing_msgid.append(msgid)

    # Init translator only if the '--no-translate' flag is NOT set
    translator = None
    if not args.no_translate:
        eta = timedelta(seconds=len(pot_file) * WTIME)
        pinfo("Approximate time to check translations online: {}".format(eta))

        translator = google_translate.GoogleTranslator(timeout=5.0, retries=2, wait_time=WTIME)

        # Set source language for GoogleTranslator
        if args.tlang is not None:
            src_lang = args.tlang
            pinfo("Forcing '{}' as the translator's source language".format(src_lang))
        else:
            # Get a valid source language for Google
            # for example convert 'ar_SA' to 'ar' or 'zh_CN' to 'zh-CN'
            src_lang = args.language

            if src_lang not in translator._lang_dict:
                src_lang = src_lang.replace("_", "-")

                if src_lang not in translator._lang_dict:
                    src_lang = src_lang.split("-")[0]

    # Keep entries that need further analysis using the translator
    further_analysis = []

    for entry in po_file:
        if not entry.translated():
            not_translated.append(entry)

        elif entry.msgid == entry.msgstr:
            same_msgstr.append(entry)

        else:
            further_analysis.append(entry)

    if translator is not None:
        # Pass translations as a list since GoogleTranslator can handle them
        words_dict = translator.get_info_dict([entry.msgstr for entry in further_analysis], "en", src_lang)

        for index, word_dict in enumerate(words_dict):
            # Get the corresponding POEntry since the words_dict does not contain those
            entry = further_analysis[index]

            if word_dict is not None:
                if word_dict["has_typo"]:
                    with_typo.append(entry)

                if word_dict["translation"].lower() != entry.msgid.lower():

                    found = False

                    # Check verbs, nouns, adverbs, etc..
                    for key in word_dict["extra"]:
                        if entry.msgid.lower() in word_dict["extra"][key].keys():
                            found = True
                            break

                    if not found:
                        verify_trans.append((entry, word_dict["translation"]))

    # time to report
    print("=" * 25 + "Report" + "=" * 25)

    if missing_msgid:
        print("Missing msgids")

        for msgid in missing_msgid:
            print("  \"{}\"".format(msgid))

    if not_translated:
        print("Not translated")

        for entry in not_translated:
            print("  line: {} msgid: \"{}\"".format(entry.linenum, entry.msgid))

    if same_msgstr:
        print("Same msgstr")

        for entry in same_msgstr:
            print("  line: {} msgid: \"{}\"".format(entry.linenum, entry.msgid))

    if with_typo:
        print("With typo")

        for entry in with_typo:
            print("  line: {} msgid: \"{}\" msgstr: \"{}\"".format(entry.linenum, entry.msgid, entry.msgstr))

    if verify_trans:
        print("Verify translation")

        for item in verify_trans:
            entry, translation = item
            print("  line: {} msgid: \"{}\" trans: \"{}\"".format(entry.linenum, entry.msgid, translation))

    if fuzzy_trans:
        print("Fuzzy translations")

        for entry in fuzzy_trans:
            print("  line: {} msgid: \"{}\"".format(entry.linenum, entry.msgid))

    total = len(missing_msgid) + len(not_translated) + len(same_msgstr) + len(with_typo) + len(verify_trans) + len(fuzzy_trans)

    print("")
    print("Missing msgids\t\t: {}".format(len(missing_msgid)))
    print("Not translated\t\t: {}".format(len(not_translated)))
    print("Same msgstr\t\t: {}".format(len(same_msgstr)))
    print("With typo\t\t: {}".format(len(with_typo)))
    print("Verify translation\t: {}".format(len(verify_trans)))
    print("Fuzzy translations\t: {}".format(len(fuzzy_trans)))
    print("Total\t\t\t: {}".format(total))


if __name__ == "__main__":
    try:
        main(parse())
    except KeyboardInterrupt:
        print("KeyboardInterrupt")