#!/usr/bin/env python2
"""Extract schema objects from a postgresql schema-only dump, and store them in a git repository."""
from __future__ import print_function
import os
import re
import hashlib
import subprocess
import argparse
import json
import sys
import shutil
import multiprocessing


MY_NAME = 'pg_schematrack.py'
FILENAME_MAXLEN = 255
INFO_FILE = '__' + MY_NAME + '__info.json'


class ArchiveRecord:
    """Trivial class representing an archive TOC line"""

    def __init__(self, recnum, id2, id3, sig, tocfile_data):
        hasher = hashlib.new('md5')
        self.recid = recnum
        self.id2 = id2
        self.id3 = id3
        self.sig = sig
        self.tocfile_data = tocfile_data
        hasher.update(sig)
        self.md5sig = str(hasher.hexdigest())

    def filename_from_sig(self):
        """Generate a usable filename from an object's TOC line"""
        filename = self.sig
        filename = re.sub(r'\+', '+++', filename)
        filename = re.sub(r'[/ ]', '+', filename)
        if len(filename) > FILENAME_MAXLEN:
            # This should prevent prefix collisions
            filename = filename[:(FILENAME_MAXLEN - 9)] + '~' + self.md5sig[:8]
            print("%s: WARNING: Truncating filename for object '%s' to '%s'"
                  % (MY_NAME, self.sig, filename), file=sys.stderr)
        return filename

    def tocline(self):
        """Reproduce the original TOC line from the internal representation"""
        return str("%d; %d %d %s" % (self.recid, self.id2, self.id3, self.sig))

    def __repr__(self):
        return self.md5sig

    def __str__(self):
        return str("%s(%s) %d %d %d %s %s" %
                   (self.tocfile_data['name'], tocfile_data['md5'],
                    self.recid, self.id2, self.id3, self.md5sig, self.sig))


def extract_from_archive(work):
    """Use `pg_restore` on a single TOC entry to extract the associated SQL,
    trim comments, and persist that into a object-specific file"""
    archive_file, workdir, record, outfile = work
    os.chdir(workdir)
    try:
        cmdline = ["pg_restore", "-L", "/dev/stdin", archive_file]
        pgrestore = subprocess.Popen(cmdline, stdout=subprocess.PIPE, stdin=subprocess.PIPE)
        pgrestore.stdin.write(record)
        pgrestore.stdin.flush()
        pgrestore.stdin.close()
        res = pgrestore.stdout.read()
        # Strip SQL DDL comments and some whitespace
        res = re.sub(r'^---?($| .*)', '', res, flags=re.M)
        res = re.sub(r'(^$\n)+', '\n', res, flags=re.M)
        pgrestore.wait()
        if os.path.exists(outfile):
            # pg_dump from postgresql 9.4 doesn't necessarily include relation
            # names in various TOC entry signatures
            print("%s: WARNING: Object name collision for file '%s' - please upgrade `pg_dump`!"
                  % (MY_NAME, outfile), file=sys.stderr)
        out = open(outfile, "wb")
        out.write(res)
        out.flush()
        out.close()
        return True
    except Exception as e:
        print("ERROR: Could not dump object '%s': %s" % (record, e), file=sys.stderr)
        return False


class GitDirValidator(argparse.Action):
    """Let `git` check if we're dealing with an initialized repository"""
    def __call__(self, parser, namespace, values, option_string=None):
        if isinstance(values, list):
            raise ValueError("cannot have more than one git repository directory")
        elif isinstance(values, str):
            try:
                subprocess.check_output(["git", "-C", values, "status"])
            except subprocess.CalledProcessError:
                raise ValueError("'%s' does not look like a git repository" % values)
        setattr(namespace, self.dest, values)


if __name__ == "__main__":
    P = argparse.ArgumentParser(prog=MY_NAME)
    P.add_argument("-r", "--repo-dir", type=str, required=True,
                   action=GitDirValidator,
                   help="target git repository directory")
    P.add_argument("-s", "--store-dump-verbatim", required=False,
                   action="store_true",
                   help="also store/commit 'pg_dump_file' verbatim")
    P.add_argument("-D", "--store-in-subdir", required=False,
                   action="store_true",
                   help="store files in a 'pg_dump_file'-derived subdirectory")
    P.add_argument("-i", "--no-info-file", required=False, default=False,
                   action="store_true",
                   help='skip writing "info"-file with pg_dump_file metadata')
    P.add_argument("-n", "--no-auto-commit", required=False, default=False,
                   action="store_true",
                   help="do not `git commit` automatically")
    P.add_argument("-m", "--commit-message", type=str, required=False,
                   help="set commit message")
    P.add_argument("--forks", type=int, required=False,
                   default=int(multiprocessing.cpu_count() * 2),
                   help="number of worker processes (capped to 128)")
    P.add_argument("pg_dump_file", help="input file, generated by pg_dump")
    args = P.parse_args()

    OUTPUT_DIR = os.path.abspath(args.repo_dir)

    worker_pool = multiprocessing.Pool(processes=min(128, args.forks))

    pg_dump_file = os.path.abspath(args.pg_dump_file)
    tocfile_stat = os.stat(pg_dump_file)
    tocfile_data = dict()
    tocfile_data['path'] = os.path.basename(pg_dump_file)
    tocfile_data['mtime'] = int(tocfile_stat.st_mtime)
    tocfile_data['size'] = int(tocfile_stat.st_size)

    try:
        tocfile_data['git_info'] = subprocess.check_output(["git", "--version"]).strip()
    except OSError as e:
        print("%s: FATAL: cannot execute `git`: %s" % (MY_NAME, e), file=sys.stderr)
        sys.exit(1)

    try:
        tocfile_data['pg_restore_info'] = subprocess.check_output(["pg_restore", "-V"]).strip()
    except OSError as e:
        print("%s: FATAL: cannot execute `pg_restore`: %s" % (MY_NAME, e), file=sys.stderr)
        sys.exit(1)

    hasher = hashlib.new('md5')
    generated_toc = subprocess.check_output(["pg_restore", "-l", pg_dump_file])
    hasher.update(generated_toc)
    tocfile_data['md5'] = hasher.hexdigest()

    toc_re = r'^([0-9]+); ([0-9]+) ([0-9]+) (.*)'

    records = []

    for tocline in generated_toc.splitlines():
        match = re.search(toc_re, tocline)
        if match:
            hasher = hashlib.new('md5')
            recnum = int(match.group(1))
            _id2 = int(match.group(2))
            _id3 = int(match.group(3))
            sig = str(match.group(4))
            hasher.update(sig)
            sighash = str(hasher.hexdigest())
            x = ArchiveRecord(recnum, _id2, _id3, sig, tocfile_data)
            records.append(x)

    wipe_dir = OUTPUT_DIR
    if args.store_in_subdir:
        wipe_dir = os.path.join(OUTPUT_DIR, tocfile_data['path'])
    if os.path.isdir(wipe_dir):
        for f in os.listdir(wipe_dir):
            fpath = os.path.join(wipe_dir, f)
            if os.path.isfile(fpath):
                os.unlink(fpath)

    os.chdir(OUTPUT_DIR)
    if args.store_in_subdir:
        if not os.path.isdir(tocfile_data['path']):
            os.mkdir(tocfile_data['path'])
        os.chdir(tocfile_data['path'])

    jobs = []
    for r in records:
        jobs.append((pg_dump_file, os.getcwd(), r.tocline(), r.filename_from_sig()))
    worker_pool.map(extract_from_archive, jobs)

    if not args.no_info_file:
        with open(INFO_FILE, "w") as f:
            f.write(json.dumps(tocfile_data, sort_keys=True, indent=2))
            f.write('\n')
            f.flush()
            f.close()

    os.chdir(OUTPUT_DIR)
    verbatim_dir = 'pg_dump_verbatim/'
    if args.store_dump_verbatim:
        if not os.path.isdir(verbatim_dir):
            os.mkdir(verbatim_dir)
        with open(os.path.join(verbatim_dir, ".gitattributes"), "w") as f:
            f.write("* binary\n")
            f.flush()
            f.close()
        shutil.copy2(pg_dump_file, os.path.join(verbatim_dir, tocfile_data['path']))

    if args.commit_message:
        commit_message = args.commit_message
    else:
        commit_message = "%s: %s" % (MY_NAME, tocfile_data['path'])

    git_changes = subprocess.check_output(["git", "status", "--porcelain"])
    if not args.no_auto_commit:
        if git_changes != '':
            subprocess.check_call(["git", "-c", "user.name=" + MY_NAME,
                                   "-c", "user.email=<>", "add", "."])
            subprocess.check_call(["git", "-c", "user.name=" + MY_NAME,
                                   "-c", "user.email=<>", "commit",
                                   "-m", commit_message])
        else:
            print("%s: No changes to record." % MY_NAME)
    else:
        print("%s: Was requested not to commit any changes." % MY_NAME)
