Staging #29

Merged: 22 commits merged on Sep 2, 2016
Changes from 7 commits
8 changes: 7 additions & 1 deletion scripts/05-sql-import-receipts.sh
@@ -6,5 +6,11 @@ METADB=meta.db

if [ ! -f $METADB ] ; then
sqlite3 $METADB < ./scripts/meta.sql
./scripts/tsv2sqlite.py ${DEBUG} -o ${METADB} gxd/receipts.tsv
./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Receipt" -o ${METADB} gxd/receipts.tsv
./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Publication" -o ${METADB} gxd/publications.tsv
./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Similar" -o ${METADB} gxd/similar.tsv
else
echo "$METADB already exists"
fi
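
(Editorial aside: tsv2sqlite.py itself is not part of this diff. Below is only a minimal sketch of the kind of type-to-table dispatch the new --tsvtype flag implies; the table names and column handling are assumptions, not taken from scripts/meta.sql.)

#!/usr/bin/env python3
# Hypothetical sketch only: the table names below are assumptions, not from meta.sql.
import argparse
import csv
import sqlite3

# assumed mapping from --tsvtype value to target table
TSVTYPE_TABLES = {
    "Receipt": "receipts",
    "Publication": "publications",
    "Similar": "similar_grids",
}

def load_tsv(dbpath, tsvtype, tsvpath):
    table = TSVTYPE_TABLES[tsvtype]
    with open(tsvpath, newline="") as fp, sqlite3.connect(dbpath) as db:
        rows = list(csv.DictReader(fp, delimiter="\t"))
        if not rows:
            return
        cols = list(rows[0].keys())  # assumes TSV headers match the table's columns
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (
            table, ",".join(cols), ",".join("?" for _ in cols))
        db.executemany(sql, [[r[c] for c in cols] for r in rows])

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--tsvtype", required=True, choices=sorted(TSVTYPE_TABLES))
    p.add_argument("-o", dest="output", required=True, help="path to the sqlite db")
    p.add_argument("tsv")
    a = p.parse_args()
    load_tsv(a.output, a.tsvtype, a.tsv)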


3 changes: 2 additions & 1 deletion scripts/20-analyze.sh
@@ -8,9 +8,10 @@ mkdir -p $PUB
rm -f $PUB/*

# regenerate pub/puzzles.tsv
# TODO: should populate puzzles table in sqlite instead
scripts/21-clean-metadata.py -o $PUB/puzzles.tsv $GXD

# regenerate pub/pubyears.tsv
scripts/22-pubyears.py
scripts/25-analyze-puzzle.py -o $WWW/ -c $GXD $GXD
scripts/26-clues-tsv.py -c $GXD -o $PUB/
scripts/27-pubyear-stats.py -c ${GXD}
165 changes: 165 additions & 0 deletions scripts/21b-clean-metadata.py
@@ -0,0 +1,165 @@
#!/usr/bin/env python3

# Usage: $0 [-o <puzzles.tsv>] <input>
#
# Generates puzzles.tsv with cleaned metadata for each .xd in <input>.
#

from xdfile import utils, metasql as metadb
import xdfile
import re


CLEAN_SUFFIX = '_clean'


def find_date(s):
m = re.search(r"\s*(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)?\s*(\d{1,2})?,?\s*\d{4},?\s*", s, flags=re.IGNORECASE)
if m:
return m.group(0)

m = re.search(r"\d{2}[/\-]?\d{2}[/\-]?\d{2,4}", s)
if m:
return m.group(0)

return ""


def boil_copyright(copyright, author):
import re
if author:
copyright = copyright.replace(author, "")

# and remove textual date
dt = find_date(copyright)
if dt:
copyright = copyright.replace(dt, " ")

# copyright = copyright.replace(u"©", "(c)")

return copyright


# cleans the author byline and also extracts/cleans the editor
def clean_author(author, editor):
if author:
r = r'(?i)(?:(?:By )*(.+)(?:[;/,-]|and) *)?(?:edited|Editor|(?<!\w)Ed[.])(?: By)*(.*)'
m = re.search(r, author)
if m:
author, editor = m.groups()

if author:
while author.lower().startswith("by "):
author = author[3:]

while author and author[-1] in ",.":
author = author[:-1]
else:
author = ""

if " / " in author:
if not editor:
author, editor = author.rsplit(" / ", 1)

if editor:
while editor.lower().startswith("by "):
editor = editor[3:]

while editor and editor[-1] in ",.":
editor = editor[:-1]

author = author.strip()
editor = editor.strip()
return author, editor
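
# Rough example of the intended split (hypothetical byline):
#   clean_author("By Jane Doe / Edited by John Smith", "") -> ("Jane Doe", "John Smith")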


def clean_title(title):
if title.endswith(']'):
title = title[:title.rfind('[')]

# for some USA Today puzzles, the title is only the text between the double quotes
if title.startswith("USA Today"):
if title and title[-1] == '"':
title = title[title.index('"') + 1:-1]
if title[-1] == ",":
title = title[:-1]
elif title and title[0] == '"':
title = title[1:title.rindex('"')]

return title
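
# Rough examples (hypothetical titles):
#   clean_title('USA Today "Raise the Roof"')    -> 'Raise the Roof'
#   clean_title('Themeless Monday [2016-01-04]') -> 'Themeless Monday ' (bracketed suffix dropped, trailing space kept)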


def clean_headers(xd):
# remove known unwanted header fields, log unknown headers
for hdr in list(xd.headers.keys()):
if hdr in ["Source", "Identifier", "Acquired", "Issued", "Category"]:
xd.set_header(hdr, None)
else:
if hdr.lower() not in xdfile.HEADER_ORDER:
utils.warn("%s: '%s' header not known: '%s'" % (xd.filename, hdr, xd.headers[hdr]))

# clean Author and Editor headers
author = xd.get_header("Author") or ""
if not author:
if xd.get_header("Creator"):
assert not author
author = xd.get_header("Creator")
xd.set_header("Creator", None)

editor = xd.get_header("Editor") or ""

newauthor, neweditor = clean_author(author, editor)

if newauthor != author:
xd.set_header("Author" + CLEAN_SUFFIX, newauthor)

if neweditor != editor:
xd.set_header("Editor" + CLEAN_SUFFIX, neweditor)

# clean Title header
title = xd.get_header("Title") or ""
newtitle = clean_title(title)

if newtitle != title:
xd.set_header("Title" + CLEAN_SUFFIX, newtitle)
# create Date header
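# fallback order: existing Date header, then a date parsed from the filename, then a date found in the Copyright text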
dt = xd.get_header("Date")

## try getting Date from filename
if not dt:
try:
d = utils.parse_date_from_filename(xd.filename)
if d:
dt = d.strftime("%Y-%m-%d")
except Exception as e:
utils.error(str(e))
if args.debug:
raise

## try getting Date from copyright
if not dt:
rights = xd.get_header("Copyright") or ""
dt = find_date(rights)

if dt:
xd.set_header("Date", dt)



def main():
args = utils.get_args(desc='outputs cleaned puzzle metadata rows')

xd_list = []
# Prepare the full list of cleaned puzzles first
for input_source in args.inputs:
for fn, contents in utils.find_files(input_source, ext='.xd'):
xd = xdfile.xdfile(contents.decode('utf-8'), fn)
clean_headers(xd)
xd_list.append(xd)
# metadb.update_puzzles_row(xd)
# then write them all to the SQL database in one batch
metadb.append_puzzles(xd_list)


if __name__ == "__main__":
main()
40 changes: 0 additions & 40 deletions scripts/22-pubyears.py

This file was deleted.

144 changes: 144 additions & 0 deletions scripts/27-pubyear-stats.py
@@ -0,0 +1,144 @@
#!/usr/bin/env python3

import json
import re
from collections import defaultdict, Counter

from xdfile.utils import error, debug, info
from xdfile import utils, metasql, metadatabase as metadb
from xdfile import year_from_date, dow_from_date
import xdfile



def main():
args = utils.get_args('generate pub-years data')
outf = utils.open_output()

weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]

pubyears = {} # (pubid, year) -> list of xd
for xd in xdfile.corpus():
puby = (xd.publication_id(), xd.year())
if puby not in pubyears:
pubyears[puby] = []
pubyears[puby].append(xd)

if pubyears:
metasql.execute("DELETE FROM stats;")

for puby, xdlist in sorted(pubyears.items()):
pubid, year = puby
npublic = 0

# TODO: SELECT FROM publications
nexisting = 0

# organize by day-of-week
byweekday = {}
byweekday_similar = {}
for w in weekdays:
byweekday[w] = []
byweekday_similar[w] = []

for xd in xdlist:
dow = dow_from_date(xd.get_header('Date'))
if dow: # Might be empty date or only a year
byweekday[dow].append(xd)

for r in metasql.select("SELECT * FROM similar_grids WHERE xdid LIKE '{}%' AND GridMatchPct > 25".format(pubid + str(year))):
xd = xdfile.get_xd(r['xdid'])
if xd:
dt = xd.get_header('Date')
if dt:
assert dt
dow = dow_from_date(dt)
if dow: # Might be empty date or only a year
byweekday_similar[dow].append(r)
else:
debug("Date not set for: %s" % xd)

# tally stats
for weekday in weekdays:
copyrights = Counter() # [copyright_text] -> number of xd
editors = Counter() # [editor_name] -> number of xd
formats = Counter() # ["15x15 RS"] -> number of xd
# TODO: derive nexisting from the publications table (see TODO above)
nexisting = 0

nxd = len(byweekday[weekday])
public_xdids = [] # Empty for now
for xd in byweekday[weekday]:
xdid = xd.xdid()
if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
npublic += 1

editor = xd.get_header('Editor').strip()
if editor:
editors[editor] += 1

sizestr = xd.sizestr()
if sizestr:
formats[sizestr] += 1

copyright = xd.get_header('Copyright').strip()
if copyright:
copyrights[copyright] += 1

# debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
def process_counter(count, comp_value):
# Return the most common item from count, suffixed with its tally unless it equals comp_value (i.e. covers every puzzle)
if count:
item, num = count.most_common(1)[0]
if num != comp_value:
item += " (%s)" % num
else:
item = ''
return item
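
# e.g. with editors == Counter({'Will Shortz': 5, 'Someone Else': 1}) and nxd == 6,
# this yields 'Will Shortz (5)'; if one editor covered all nxd puzzles, just the bare name.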

#
maineditor = process_counter(editors, nxd)
maincopyright = process_counter(copyrights, nxd)
mainformat = process_counter(formats, nxd)

reprints = 0
touchups = 0
redones = 0
copies = 0
themecopies = 0
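# classify each later-dated match: same author -> reprint (100% grid match), touchup (>=50%),
# or themecopy (>=30%); different author -> copy (>=50%) or themecopy (>=30%)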
for r in byweekday_similar[weekday]:
# debug("Xdid %s Xdidmatch %s" % (r['xdid'], r['xdidMatch']))
xd1 = xdfile.get_xd(r['xdid'])
xd2 = xdfile.get_xd(r['xdidMatch'])
if xd1 is None or xd2 is None:
continue
# debug("XD1: %s XD2: %s" % (xd1, xd2))
dt1 = xd1.get_header('Date')
dt2 = xd2.get_header('Date')
aut1 = xd1.get_header('Author')
aut2 = xd2.get_header('Author')
pct = int(r['GridMatchPct'])
if dt2 < dt1: # only capture the later one
if aut1 == aut2:
if pct == 100:
reprints += 1
elif pct >= 50:
touchups += 1
elif pct >= 30:
themecopies += 1
else: # suspicious
if pct >= 50:
copies += 1
elif pct >= 30:
themecopies += 1

metasql.execute("INSERT INTO stats VALUES (?,?,?, ?,?,?, ?, ?,?,?, ?,?, ?,?)",
(pubid, year, weekday,
mainformat, maineditor, maincopyright,
nexisting, nxd, npublic,
reprints, touchups, redones,
copies, themecopies))


if __name__ == "__main__":
main()
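
(Editorial aside: the INSERT above supplies 14 values per row. The actual stats schema lives in scripts/meta.sql and is not shown in this diff; the sketch below is only a guess at a compatible table, with column names taken from the variable names.)

# Hypothetical sketch: column names inferred from the INSERT's variables, not from meta.sql.
import sqlite3

STATS_DDL = """
CREATE TABLE IF NOT EXISTS stats (
    pubid TEXT, year TEXT, weekday TEXT,
    mainformat TEXT, maineditor TEXT, maincopyright TEXT,
    nexisting INTEGER, nxd INTEGER, npublic INTEGER,
    reprints INTEGER, touchups INTEGER, redones INTEGER,
    copies INTEGER, themecopies INTEGER
)
"""

if __name__ == "__main__":
    with sqlite3.connect("meta.db") as db:
        db.execute(STATS_DDL)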
3 changes: 3 additions & 0 deletions scripts/30-mkwww.sh
@@ -15,6 +15,9 @@ cp $PUB/*.tsv $WWW/pub/
echo -en "${GREEN}Generate /pub/[<pub>][<year>]${NORMAL}\n"
scripts/31-mkwww-publishers.py $CORPUS -o $WWW/

echo -en "${GREEN}Generate /pub/ index${NORMAL}\n"
scripts/37-pubyear-svg.py -o $WWW/

echo -en "${GREEN}Generate /pub/word/<ANSWER>${NORMAL}\n"
scripts/33-mkwww-words.py $CORPUS -o $WWW/
