#!/usr/bin/env python3

"""Simply turn all input files to html. No errorchecking, so keep backups. It uses the mediawiki webapi, so you need to be online.

Usage: ./parse_wikipedia_files_to_html.py <files>

Copyright: 2010 © Arne Babenhauserheide
License: You can use this under the GPLv3 or later, if you add the appropriate license files → http://gnu.org/licenses/gpl.html
"""

from urllib.request import urlopen
from urllib.parse import quote
from urllib.error import HTTPError, URLError
from time import sleep
from random import random
from yaml import safe_load
from sys import argv

mediawiki_files = argv[1:]

def wikitext_to_html(text):
    """parse text in mediawiki markup to html."""
    url = "http://en.wikipedia.org/w/api.php?action=parse&format=yaml&text=" + quote(text, safe="") + " "
    with urlopen(url) as f:
        y = f.read()
    # The rendered HTML sits under parse -> text -> "*" in the response.
    return safe_load(y)["parse"]["text"]["*"]
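# Example (illustrative only; the exact HTML depends on the live API, so the
# output below is an assumed typical shape, not a recorded response):
#
#     >>> wikitext_to_html("''Hello'' [[world]]")
#     '<div class="mw-parser-output"><p><i>Hello</i> <a href="/wiki/World">world</a></p></div>'
#
# The API wraps the rendered markup as {"parse": {"text": {"*": "<div>…</div>"}}},
# which is why wikitext_to_html indexes ["parse"]["text"]["*"].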

for mf in mediawiki_files:
    with open(mf, encoding="utf-8") as f:
        text = f.read()
    html_header = '<html><head><meta charset="utf-8" /><title>' + mf + "</title></head><body>"
    html_footer = "</body></html>"
    try:
        text = wikitext_to_html(text)
        # overwrite the source file with the rendered HTML
        with open(mf, "w", encoding="utf-8") as f:
            f.write(html_header)
            f.write(text)
            f.write(html_footer)
    except HTTPError as err:
        print("Error converting file", mf, "-", err)
    except URLError:
        print("Server doesn’t like us :(", mf)
        sleep(10*random())
    # add a random wait, so the api server doesn’t kick us
    sleep(3*random())