#!/usr/bin/env python3
#
# Read the downloaded ISO source files (HTML, text, tab-separated and XML)
# and generate the corresponding JSON files.
#
# Copyright © 2016 Dr. Tobias Quathamer <toddy@debian.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

import json
import re


# wget -O tmp.html http://www.loc.gov/standards/iso639-5/id.php
#
# Extract the ISO 639-5 entries from the HTML table: a row header cell holds
# the alpha-3 code and the line following it holds the English name.
iso = {"639-5": []}
# NOTE(review): the page is assumed to be UTF-8 encoded -- confirm. Naming the
# encoding keeps the result independent of the locale the script runs under.
with open("tmp.html", encoding="utf-8") as html:
    for line in html:
        match = re.search(r'<td scope="row">([a-z]+)</td>', line)
        if not match:
            continue
        item = {"alpha_3": match.group(1)}
        # The next line contains the english name
        line = next(html, "")
        match = re.search(r'<td>([^<]+)</td>', line)
        if not match:
            # Fail with a clear message instead of an AttributeError on None
            raise ValueError(
                "No name found for ISO 639-5 code {!r} in line {!r}".format(
                    item["alpha_3"], line
                )
            )
        item["name"] = match.group(1)
        iso["639-5"].append(item)

# Sort by alpha_3
iso["639-5"].sort(key=lambda item: item["alpha_3"])

# Write json; the names are written unescaped (ensure_ascii=False), so the
# file encoding is set explicitly
with open("data/iso_639-5.json", "w", encoding="utf-8") as out:
    json.dump(iso, out, ensure_ascii=False, indent=2, sort_keys=True)
    # Add a final newline
    out.write("\n")





# wget -O tmp.txt http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
#
# Each input line consists of pipe-separated fields. The ones used here are:
#   0: alpha-3 code (stored as the bibliographic code when field 1 is set)
#   1: alternative alpha-3 code, may be empty
#   2: alpha-2 code, may be empty
#   3: English name
iso = {"639-2": []}
# "utf-8-sig" decodes UTF-8 and drops a leading byte order mark if there is
# one, so that it cannot end up as part of the first code
with open("tmp.txt", encoding="utf-8-sig") as txt:
    for line in txt:
        if not line.strip():
            # Skip blank lines instead of failing with an IndexError
            continue
        items = line.split("|")
        item = {
            "alpha_3": items[0],
            # Insert the missing space after "ca."
            "name": items[3].replace("ca.", "ca. ").strip()
        }
        if item["alpha_3"] == "ben":
            item["common_name"] = "Bangla"
        # Compare by value (truthiness), not by identity with a literal
        if items[1]:
            item["alpha_3"] = items[1]
            item["bibliographic"] = items[0]
        if items[2]:
            item["alpha_2"] = items[2]
        iso["639-2"].append(item)

# Sort by alpha_3
iso["639-2"].sort(key=lambda item: item["alpha_3"])

# Write json; the names are written unescaped (ensure_ascii=False), so the
# file encoding is set explicitly
with open("data/iso_639-2.json", "w", encoding="utf-8") as out:
    json.dump(iso, out, ensure_ascii=False, indent=2, sort_keys=True)
    # Add a final newline
    out.write("\n")




# wget -O tmp.tab http://www-01.sil.org/iso639-3/iso-639-3.tab
# wget -O tmp-inverted.tab http://www-01.sil.org/iso639-3/iso-639-3_Name_Index.tab
#
# Map each code to {name: inverted name}, recording only the names whose
# inverted form differs from the name itself.
inverted_name = {}
with open("tmp-inverted.tab", encoding="utf-8") as txt:
    # Discard the first line containing the header
    txt.readline()
    for line in txt:
        items = line.strip().split("\t")
        if items == [""]:
            # Skip blank lines instead of failing with an IndexError
            continue
        if items[1] != items[2]:
            inverted_name.setdefault(items[0], {})[items[1]] = items[2]

# Fields of tmp.tab used below, by index: 0 = alpha-3 code, 1 and 2 = the two
# alternative codes that are compared with each other, 3 = alpha-2 code,
# 4 = scope, 5 = type, 6 = name
iso = {"639-3": []}
with open("tmp.tab", encoding="utf-8") as txt:
    # Discard the first line containing the header
    txt.readline()
    for line in txt:
        if not line.strip():
            # Skip blank lines instead of failing with an IndexError
            continue
        items = line.split("\t")
        item = {
            "alpha_3": items[0],
            "name": items[6].strip(),
            "scope": items[4],
            "type": items[5],
        }
        if item["alpha_3"] == "ben":
            item["common_name"] = "Bangla"
        names = inverted_name.get(item["alpha_3"], {})
        if item["name"] in names:
            item["inverted_name"] = names[item["name"]]
        if items[1] != items[2]:
            item["bibliographic"] = items[1]
        # Compare by value (truthiness), not by identity with a literal
        if items[3]:
            item["alpha_2"] = items[3]
        iso["639-3"].append(item)

# Sort by alpha_3
iso["639-3"].sort(key=lambda item: item["alpha_3"])

# Write json; the names are written unescaped (ensure_ascii=False), so the
# file encoding is set explicitly
with open("data/iso_639-3.json", "w", encoding="utf-8") as out:
    json.dump(iso, out, ensure_ascii=False, indent=2, sort_keys=True)
    # Add a final newline
    out.write("\n")




# wget -O tmp.xml http://www.currency-iso.org/dam/downloads/lists/list_one.xml
#
# The XML is read line by line: a <CcyNm> line is expected to be followed by
# the <Ccy> (alpha_3) line and then by the <CcyNbr> (numeric) line.
iso = {"4217": []}
# A set makes the duplicate check below a constant-time lookup
codes_seen = set()
with open("tmp.xml", encoding="utf-8") as xml:
    for line in xml:
        match = re.search(r'<CcyNm>([^<]+)</CcyNm>', line)
        if not match:
            continue
        item = {"name": match.group(1)}
        # The next line contains the alpha_3
        line = next(xml, "")
        match = re.search(r'<Ccy>([^<]+)</Ccy>', line)
        if not match:
            # No code follows this name, so there is nothing to record
            continue
        item["alpha_3"] = match.group(1)
        # The next line contains the numeric
        line = next(xml, "")
        match = re.search(r'<CcyNbr>([^<]+)</CcyNbr>', line)
        if not match:
            # Fail with a clear message instead of an AttributeError on None
            raise ValueError(
                "No numeric code found for ISO 4217 code {!r} "
                "in line {!r}".format(item["alpha_3"], line)
            )
        item["numeric"] = match.group(1)
        # Keep only the first entry seen for each alpha_3 code
        if item["alpha_3"] not in codes_seen:
            iso["4217"].append(item)
            codes_seen.add(item["alpha_3"])

# Sort by alpha_3
iso["4217"].sort(key=lambda item: item["alpha_3"])

# Write json; the names are written unescaped (ensure_ascii=False), so the
# file encoding is set explicitly
with open("data/iso_4217.json", "w", encoding="utf-8") as out:
    json.dump(iso, out, ensure_ascii=False, indent=2, sort_keys=True)
    # Add a final newline
    out.write("\n")





# wget -O tmp.zip http://www.unicode.org/iso15924/iso15924.txt.zip
# NOTE(review): the download is a zip archive; the text file opened below is
# presumably the one unpacked from it -- confirm the file name when updating.
#
# Each data line consists of semicolon-separated fields, of which the first
# three are used: alpha_4 code, numeric code and name.
iso = {"15924": []}
with open("iso15924-utf8-20160119.txt", encoding="utf-8") as txt:
    for line in txt:
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            # Skip blank lines and "#" comment lines; these do not carry the
            # three fields read below
            continue
        items = line.split(";")
        item = {
            "alpha_4": items[0],
            "numeric": items[1],
            "name": items[2]
        }
        iso["15924"].append(item)

# Sort by alpha_4
iso["15924"].sort(key=lambda item: item["alpha_4"])

# Write json; the names are written unescaped (ensure_ascii=False), so the
# file encoding is set explicitly
with open("data/iso_15924.json", "w", encoding="utf-8") as out:
    json.dump(iso, out, ensure_ascii=False, indent=2, sort_keys=True)
    # Add a final newline
    out.write("\n")
