diff --git a/AWS.md b/AWS.md
new file mode 100644
index 0000000..8e707f4
--- /dev/null
+++ b/AWS.md
@@ -0,0 +1,22 @@
+# AWS commands
+
+## Extend root volume of instance
+1) Stop the instance
+
+2) Create a snapshot
+aws ec2 create-snapshot --volume-id {cur_root_volume} --description 'Initial snapshot'
+
+3) Create a new root volume; get the snapshot id from the previous output
+aws ec2 create-volume --size {new_size_gb} --region us-west-2 --availability-zone us-west-2a --volume-type standard --snapshot-id {snapshot_id_of_root_volume}
+
+4) Detach the existing root volume
+aws ec2 detach-volume --volume-id {cur_root_volume}
+
+5) Attach the new root volume
+aws ec2 attach-volume --volume-id {new_root_volume_step3} --instance-id i-5fc17aca --device /dev/sda1
+
+6) Start the instance and proceed with the partition extension
+http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-expand-volume.html#recognize-expanded-volume-linux
+
+7) If all is OK, delete the old root volume
+aws ec2 delete-volume --volume-id {cur_root_volume_step2}
diff --git a/scripts/00-aws-bootstrap.sh b/scripts/00-aws-bootstrap.sh
index 556f598..0d3d501 100755
--- a/scripts/00-aws-bootstrap.sh
+++ b/scripts/00-aws-bootstrap.sh
@@ -52,7 +52,7 @@ echo "Import all .tsv to sql"
 scripts/05-sql-import-receipts.sh
 
 echo "Run deploy script"
-/bin/bash -x scripts/05-full-pipeline.sh
+/bin/bash scripts/05-full-pipeline.sh
 
 echo 'SUMMARY: End time '`date +'%Y-%m-%d %H:%M'`
 # Parse log to get summary to be mailed
diff --git a/scripts/00-aws-ebs-bootstrap.sh b/scripts/00-aws-ebs-bootstrap.sh
new file mode 100755
index 0000000..8130ff1
--- /dev/null
+++ b/scripts/00-aws-ebs-bootstrap.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+set -x
+
+WORKDIR=/tmp
+export SSHHOME=$HOME
+
+if [ -z "$HOME" ] ; then
+    # Hack for AWS where HOME is not set
+    export SSHHOME=$HOME
+    HOME=/tmp
+    if [[ $UID -eq '0' ]]; then
+        export SSHHOME=/root
+    fi
+fi
+
+# This script is passed as userdata to the launch-config, which the base AMI
+# executes at the end of initialization.
+
+export LC_ALL="en_US.UTF-8"
+export LOGFILE=/tmp/`date +"%Y-%m-%d"`.log
+export SUMLOGFILE=/tmp/`date +"%Y-%m-%d"`summary.log
+# To run xdfile-based scripts below
+export PYTHONPATH=.
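+
+# Mirror all stdout/stderr into ${LOGFILE} while still echoing to the console
+# (bash process substitution; tee -i ignores interrupts so logging survives signals)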
+exec > >(tee -i ${LOGFILE}) 2>&1
+echo 'SUMMARY: Start time:'`date +'%Y-%m-%d %H:%M'`
+
+# export DEBIAN_FRONTEND=noninteractive
+# sudo apt-get update && \
+# sudo apt-get install --yes language-pack-en-base zip awscli python3-lxml python3-pip git markdown python3-boto3 sqlite3 && \
+# sudo pip3 install cssselect botocore
+
+cd $WORKDIR
+# Get config file from AWS
+aws s3 cp --region=us-west-2 s3://xd-private/etc/config $WORKDIR/config
+source $WORKDIR/config
+
+echo "Clone main project repo and switch to branch ${BRANCH}"
+git clone ${XD_GIT}
+cd xd/
+git checkout ${BRANCH}
+# Export all config vars
+source scripts/config-vars.sh
+
+mkdir -p $SSHHOME/.ssh
+echo "Clone GXD repo"
+aws s3 cp --region=us-west-2 s3://xd-private/etc/gxd_rsa $SSHHOME/.ssh/
+chmod 600 $SSHHOME/.ssh/gxd_rsa
+
+cat src/aws/ssh_config >> $SSHHOME/.ssh/config
+ssh-agent bash -c "ssh-add $SSHHOME/.ssh/gxd_rsa; git clone ${GXD_GIT}"
+
+echo "Import all .tsv to sql"
+scripts/05-sql-import-receipts.sh
+
+echo "Run deploy script"
+/bin/bash scripts/05-full-pipeline.sh
+
+echo 'SUMMARY: End time '`date +'%Y-%m-%d %H:%M'`
+# Parse log to get summary to be mailed
+egrep -i 'ERROR|WARNING|SUMMARY' ${LOGFILE} > ${SUMLOGFILE}
+echo -e '\n' >> ${SUMLOGFILE}
+
+echo "Getting summary"
+scripts/48-stats.sh >> ${SUMLOGFILE}
+echo -e '\n' >> ${SUMLOGFILE}
+
+echo "SUMMARY: Full log file http://$BUCKET/logs/`basename ${LOGFILE}`" >> ${SUMLOGFILE}
+
+echo "Sending email"
+scripts/send-email.py $ADMIN_EMAIL "execution logs for $TODAY" ${SUMLOGFILE}
+
+echo "Copy logs to AWS"
+aws s3 cp --region ${REGION} --content-type='text/plain' ${LOGFILE} s3://${BUCKET}/logs/ --acl public-read
+aws s3 cp --region ${REGION} --content-type='text/plain' ${SUMLOGFILE} s3://${BUCKET}/logs/ --acl public-read
+
+echo "Make logs index page"
+scripts/49b-mkindex.sh
diff --git a/scripts/00-aws-ec2-launch-manual-ebs.sh b/scripts/00-aws-ec2-launch-manual-ebs.sh
new file mode 100755
index 0000000..d9dda62
--- /dev/null
+++ b/scripts/00-aws-ec2-launch-manual-ebs.sh
@@ -0,0 +1,58 @@
+#!/bin/bash -x
+#
+# Usage: $0 <config-file>
+# see format below
+#
+# export KEY=
+# export BRANCH=
+# export REGION=
+# export AWS_ACCESS_KEY=
+# export AWS_SECRET_KEY=
+# export BUCKET=
+# export EMAIL=
+# export XD_GIT=
+# export GXD_GIT=
+# export XD_PROFILE=
+# export AMI_ID=ami-75fd3b15 # Ubuntu Server 16.04 LTS (HVM)
+# export SSH_SECURITY_GID=sg-e00fbe87 # SSH access
+# export INSTANCE_TYPE=r3.large
+# export QUICKRUN=True # For a quick run, skipping the 20- and 30- scripts
+#
+#source src/aws/config
+
+aws="aws"
+sh="bash"
+
+XDCONFIG=$1
+if [ -n "$XDCONFIG" ]; then
+    aws s3 cp $XDCONFIG s3://xd-private/etc/config
+    source ${XDCONFIG}
+    # AMI_ID - 16.04 LTS amd64 hvm:ebs-ssd
+    # https://cloud-images.ubuntu.com/locator/ec2/
+    AMI_ID=ami-9ece19fe
+    INSTANCE_JSON=/tmp/instance.json
+
+    # created via IAM console: role/xd-scraper
+    $aws ec2 run-instances \
+        --key-name $KEY \
+        --region ${REGION} \
+        --instance-type ${INSTANCE_TYPE} \
+        --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"DeleteOnTermination":false}}]' \
+        --instance-initiated-shutdown-behavior stop \
+        --iam-instance-profile Arn="$XD_PROFILE" \
+        --user-data file://scripts/00-aws-bootstrap.sh \
+        --image-id ${AMI_ID} > $INSTANCE_JSON
+
+    # Wait a little before applying the security group
+    sleep 30
+    instance_id=$(cat $INSTANCE_JSON | jq -r .Instances[0].InstanceId)
+    $aws ec2 modify-instance-attribute --groups ${SSH_SECURITY_GID} --instance-id $instance_id
+
+    public_ip=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].PublicIpAddress')
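+    # NOTE (sketch): a fixed sleep can race instance startup, and
+    # PublicIpAddress may still be null here. Polling is safer, e.g.:
+    #   while [ -z "$public_ip" -o "$public_ip" = "null" ]; do
+    #       sleep 5
+    #       public_ip=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].PublicIpAddress')
+    #   done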
+    echo "Connecting: ssh -i ~/*.pem ubuntu@$public_ip"
+    ssh -i ~/*.pem ubuntu@$public_ip
+
+else
+    echo "Supply config file: $0 <config-file>"
+    exit 1
+fi
diff --git a/scripts/00-ebs-snapshots.sh b/scripts/00-ebs-snapshots.sh
new file mode 100755
index 0000000..da9697e
--- /dev/null
+++ b/scripts/00-ebs-snapshots.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# Manage snapshots for EBS storage
+# usage: $0 <instance-id>
+
+instance_id=$1
+
+instance_status=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].State')
+volume_id=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId')
+
+echo "Instance status"
+echo "${instance_status}"
+
+# Get all snapshots for volume
+echo "Snapshots"
+aws ec2 describe-snapshots --filter Name=volume-id,Values=${volume_id}
diff --git a/scripts/00-ebs-start.sh b/scripts/00-ebs-start.sh
new file mode 100755
index 0000000..712ad15
--- /dev/null
+++ b/scripts/00-ebs-start.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Start an EBS-backed instance
+# Usage: $0 <instance-id>
+#
+
+instance_id=$1
+aws ec2 start-instances --instance-ids ${instance_id}
+sleep 10
+
+instance_status=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].State')
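+# NOTE: .State is an object like {"Code": 16, "Name": "running"}; append
+# .Name to the jq path above if only the state name is wanted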
+
+echo ${instance_status}
+
+public_ip=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].PublicIpAddress')
+
+echo "Connect in a few seconds: ssh -i ~/*.pem ubuntu@$public_ip"
diff --git a/scripts/05-full-pipeline.sh b/scripts/05-full-pipeline.sh
index b441d85..848a401 100755
--- a/scripts/05-full-pipeline.sh
+++ b/scripts/05-full-pipeline.sh
@@ -34,7 +34,7 @@ if [ ! -n "$QUICKRUN" ]; then
 fi
 
 # commit new puzzles and saved analysis results
-/bin/bash scripts/41-git-commit.sh incoming_$NOW
+/bin/bash scripts/41-git-commit.sh
 
 # capture all logs even if other scripts fail
 scripts/39-mkwww-logs.py -o $WWW/$NOW/index.html $TMP
diff --git a/scripts/05-sql-import-receipts.sh b/scripts/05-sql-import-receipts.sh
index e042226..ed76a71 100755
--- a/scripts/05-sql-import-receipts.sh
+++ b/scripts/05-sql-import-receipts.sh
@@ -6,5 +6,11 @@ METADB=meta.db
 
 if [ ! -f $METADB ] ; then
     sqlite3 $METADB < ./scripts/meta.sql
-    ./scripts/tsv2sqlite.py ${DEBUG} -o ${METADB} gxd/receipts.tsv
+    ./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Receipt" -o ${METADB} gxd/receipts.tsv
+    ./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Publication" -o ${METADB} gxd/publications.tsv
+    ./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Similar" -o ${METADB} gxd/similar.tsv
+else
+    echo "$METADB already exists"
 fi
diff --git a/scripts/20-analyze.sh b/scripts/20-analyze.sh
index 5c57682..43a9d62 100755
--- a/scripts/20-analyze.sh
+++ b/scripts/20-analyze.sh
@@ -8,9 +8,12 @@ mkdir -p $PUB
 rm -f $PUB/*
 
 # regenerate pub/puzzles.tsv
-scripts/21-clean-metadata.py -o $PUB/puzzles.tsv $GXD
+# TODO: should populate the puzzles table in sqlite instead
+scripts/21b-clean-metadata.py $GXD
 
-# regenerate pub/pubyears.tsv
+# generate pubyears just for now; TODO: to be replaced
 scripts/22-pubyears.py
+# regenerate pub/pubyears.tsv
 
 scripts/25-analyze-puzzle.py -o $WWW/ -c $GXD $GXD
 scripts/26-clues-tsv.py -c $GXD -o $PUB/
+scripts/27-pubyear-stats.py -c ${GXD}
diff --git a/scripts/21b-clean-metadata.py b/scripts/21b-clean-metadata.py
new file mode 100755
index 0000000..eb47522
--- /dev/null
+++ b/scripts/21b-clean-metadata.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+
+# Usage: $0 [-o <output>] <input>
+#
+# Generates puzzles.tsv with cleaned metadata for each .xd in <input>.
+#
+
+from xdfile import utils, metasql as metadb
+import xdfile
+import re
+
+
+CLEAN_SUFFIX = '_clean'
+
+
+def find_date(s):
+    m = re.search(r"\s*(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)?\s*(\d{1,2})?,?\s*\d{4},?\s*", s, flags=re.IGNORECASE)
+    if m:
+        return m.group(0)
+
+    m = re.search(r"\d{2}[/\-]?\d{2}[/\-]?\d{2,4}", s)
+    if m:
+        return m.group(0)
+
+    return ""
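+# e.g. find_date("© 2006, The New York Times") -> " 2006, "
+#      find_date("NOVEMBER 21, 1955") -> "NOVEMBER 21, 1955"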
+
+
+def boil_copyright(copyright, author):
+    if author:
+        copyright = copyright.replace(author, "")
+
+    # and remove textual date
+    dt = find_date(copyright)
+    if dt:
+        copyright = copyright.replace(dt, " ")
+
+#    copyright = copyright.replace(u"©", "(c)")
+
+    return copyright
+
+
+# also editor
+def clean_author(author, editor):
+    if author:
+        r = r'(?i)(?:(?:By )*(.+)(?:[;/,-]|and) *)?(?:edited|Editor|(?
int(args.limit): break
+        if similar_grids:
+            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids))
diff --git a/scripts/27-pubyear-stats.py b/scripts/27-pubyear-stats.py
new file mode 100755
index 0000000..526a3da
--- /dev/null
+++ b/scripts/27-pubyear-stats.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+
+import json
+import re
+from collections import defaultdict, Counter
+
+from xdfile.utils import error, debug, info
+from xdfile import utils, metasql, metadatabase as metadb
+from xdfile import year_from_date, dow_from_date
+import xdfile
+
+
+def main():
+    args = utils.get_args('generate pub-years data')
+    outf = utils.open_output()
+
+    weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]
+
+    pubyears = {}  # (pubid, year) -> list of xd
+    for xd in xdfile.corpus():
+        puby = (xd.publication_id(), xd.year())
+        if puby not in pubyears:
+            pubyears[puby] = []
+        pubyears[puby].append(xd)
+
+    if pubyears:
+        metasql.execute("DELETE FROM stats;")
+
+    for puby, xdlist in sorted(pubyears.items()):
+        pubid, year = puby
+        npublic = 0
+
+        # TODO: SELECT FROM publications
+        nexisting = 0
+
+        # organize by day-of-week
+        byweekday = {}
+        byweekday_similar = {}
+        for w in weekdays:
+            byweekday[w] = []
+            byweekday_similar[w] = []
+
+        for xd in xdlist:
+            dow = dow_from_date(xd.get_header('Date'))
+            if dow:  # date might be empty or only a year
+                byweekday[dow].append(xd)
+
+        for r in metasql.select("SELECT * FROM similar_grids WHERE xdid LIKE '{}%' AND GridMatchPct > 25".format(pubid + str(year))):
+            xd = xdfile.get_xd(r['xdid'])
+            if xd:
+                dt = xd.get_header('Date')
+                if dt:
+                    dow = dow_from_date(dt)
+                    if dow:  # date might be empty or only a year
+                        byweekday_similar[dow].append(r)
+                else:
+                    debug("Date not set for: %s" % xd)
+
+        # tally stats
+        for weekday in weekdays:
+            copyrights = Counter()  # [copyright_text] -> number of xd
+            editors = Counter()  # [editor_name] -> number of xd
+            formats = Counter()  # ["15x15 RS"] -> number of xd
+            # TODO
+            nexisting = 0
+
+            nxd = len(byweekday[weekday])
+            public_xdids = []  # empty for now
+            for xd in byweekday[weekday]:
+                xdid = xd.xdid()
+                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
+                    npublic += 1
+
+                editor = xd.get_header('Editor').strip()
+                if editor:
+                    editors[editor] += 1
+
+                sizestr = xd.sizestr()
+                if sizestr:
+                    formats[sizestr] += 1
+
+                copyright = xd.get_header('Copyright').strip()
+                if copyright:
+                    copyrights[copyright] += 1
+
+            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
+            def process_counter(count, comp_value):
+                # Return the most common entry, annotated with its count
+                # whenever it does not cover all comp_value puzzles
+                if count:
+                    item, num = count.most_common(1)[0]
+                    if num != comp_value:
+                        item += " (%s)" % num
+                else:
+                    item = ''
+                return item
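+            # e.g. process_counter(Counter({'Will Shortz': 48, 'Other': 4}), 52)
+            # -> 'Will Shortz (48)'; a count covering all 52 would yield
+            # 'Will Shortz' unannotated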
+
+            maineditor = process_counter(editors, nxd)
+            maincopyright = process_counter(copyrights, nxd)
+            mainformat = process_counter(formats, nxd)
+
+            reprints = 0
+            touchups = 0
+            redones = 0
+            copies = 0
+            themecopies = 0
+            for r in byweekday_similar[weekday]:
+                xd1 = xdfile.get_xd(r['xdid'])
+                xd2 = xdfile.get_xd(r['xdidMatch'])
+                if xd1 is None or xd2 is None:
+                    continue
+                # debug("XD1: %s XD2: %s" % (xd1, xd2))
+                dt1 = xd1.get_header('Date')
+                dt2 = xd2.get_header('Date')
+                aut1 = xd1.get_header('Author')
+                aut2 = xd2.get_header('Author')
+                pct = int(r['GridMatchPct'])
+                if dt2 < dt1:  # only capture the later one
+                    if aut1 == aut2:
+                        if pct == 100:
+                            reprints += 1
+                        elif pct >= 50:
+                            touchups += 1
+                        elif pct >= 30:
+                            themecopies += 1
+                    else:  # suspicious
+                        if pct >= 50:
+                            copies += 1
+                        elif pct >= 30:
+                            themecopies += 1
+
+            metasql.execute("INSERT INTO stats VALUES (?,?,?, ?,?,?, ?, ?,?,?, ?,?, ?,?)",
+                (pubid, year, weekday,
+                 mainformat, maineditor, maincopyright,
+                 nexisting, nxd, npublic,
+                 reprints, touchups, redones,
+                 copies, themecopies))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/30-mkwww.sh b/scripts/30-mkwww.sh
index e2789a8..1d4aa0d 100755
--- a/scripts/30-mkwww.sh
+++ b/scripts/30-mkwww.sh
@@ -15,6 +15,9 @@ cp $PUB/*.tsv $WWW/pub/
 echo -en "${GREEN}Generate /pub/[][]${NORMAL}\n"
 scripts/31-mkwww-publishers.py $CORPUS -o $WWW/
 
+echo -en "${GREEN}Generate /pub/ index${NORMAL}\n"
+scripts/37-pubyear-svg.py -o $WWW/
+
 echo -en "${GREEN}Generate /pub/word/${NORMAL}\n"
 scripts/33-mkwww-words.py $CORPUS -o $WWW/
diff --git a/scripts/31-mkwww-publishers.py b/scripts/31-mkwww-publishers.py
index 6a9ae6c..cf01c6c 100755
--- a/scripts/31-mkwww-publishers.py
+++ b/scripts/31-mkwww-publishers.py
@@ -6,7 +6,7 @@
 from xdfile.utils import progress, open_output, get_args, args_parser, COLUMN_SEPARATOR
 from xdfile.utils import br_with_n
 from xdfile import html, utils, catalog, pubyear
-from xdfile import metadatabase as metadb
+from xdfile import metadatabase as metadb, metasql
 from xdfile.html import GridCalendar, mktag, year_widget
 from xdfile.xdfile import num_cells
 import xdfile
@@ -25,10 +25,10 @@ def __init__(self, pubid):
         self.puzzles_meta = []
 
     def add(self, puzrow):
-        self.copyrights[puzrow.Copyright.strip()] += 1
-        self.editors[puzrow.Editor.strip()] += 1
-        self.formats[puzrow.Size] += 1
-        datestr = puzrow.Date
+        self.copyrights[puzrow['Copyright'].strip()] += 1
+        self.editors[puzrow['Editor'].strip()] += 1
+        self.formats[puzrow['Size']] += 1
+        datestr = puzrow['Date']
         if datestr:
             if not self.mindate:
                 self.mindate = datestr
@@ -80,13 +80,13 @@ def main():
     all_pubs = {}  # [(pubid,year)] -> PublicationStats
     pubyear_rows = {}
     similar = metadb.xd_similar()
-    puzzles = metadb.xd_puzzles()
+    puzzles = metasql.select('SELECT * FROM puzzles;')
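+    # NOTE: dict-style access (puzrow['xdid'] etc.) assumes the sqlite
+    # connection in metasql sets row_factory = sqlite3.Row (not shown in
+    # this diff); plain tuples from fetchall() would not support it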
 
     outf.write_html('pub/index.html', pubyear.pubyear_html(), title='The xd crossword puzzle corpus')
 
     utils.info("collating puzzles")
-    for puzrow in puzzles.values():
-        pubid = utils.parse_pubid(puzrow.xdid)
-        year = xdfile.year_from_date(puzrow.Date)
+    for puzrow in puzzles:
+        pubid = utils.parse_pubid(puzrow['xdid'])
+        year = xdfile.year_from_date(puzrow['Date'])
         k = (pubid, year or 9999)
         if k not in all_pubs:
             all_pubs[k] = PublicationStats(pubid)
@@ -101,20 +101,18 @@ def main():
         c_grids = {}
         pubid, year = pair
        progress(pubid)
-        reused_clues = 0
         reused_answers = 0
         total_clues = 0
         total_similar = []
 
         rows = []
-        # Assign class based on xdid and similars
         def get_cell_classes(r):
             """ Return cell classes based on parameters """
             # TODO: Implement check that authors are the same
             classes = []
-            rsim = similar.get(r.xdid)
+            rsim = similar.get(r['xdid'])
             if rsim and float(rsim.similar_grid_pct) > 0:
                 matches = [x.split('=') for x in rsim.matches.split()]
                 # Get max of matches for class definition
@@ -129,10 +127,10 @@ def get_cell_classes(r):
             if max_pct >= 100:
                 classes.append('exact')
             # Highlight only grids sized > 400 cells
-            if num_cells(r.Size) >= 400:
+            if num_cells(r['Size']) >= 400:
                 classes.append('biggrid')
             # Check for pub similarity
-            pubid, y, m, d = utils.split_xdid(r.xdid)
+            pubid, y, m, d = utils.split_xdid(r['xdid'])
             if pubid:
                 ymd = '%s%s%s' % (y, m, d)
                 if pubid not in [ x[0] for x in matches ]:
@@ -148,14 +146,15 @@ def get_cell_classes(r):
         similar_text = ""
         reused_clue_pct = "n/a"
-        rsim = similar.get(r.xdid)
+        rsim = similar.get(r['xdid'])
         if rsim:
             similar_pct = float(rsim.similar_grid_pct)
             if similar_pct > 0:
                 matches = [x.split('=') for x in rsim.matches.split()]
                 for xdid, pct in matches:
-                    if xdid in puzzles.keys():
-                        similar_text += '(%s%%) %s [%s]<br/>' % (pct, puzzles[xdid].Author, xdid)
+                    if any(d['xdid'] == xdid for d in puzzles):
+                        author = [ d['Author'] for d in puzzles if d['xdid'] == xdid ]
+                        similar_text += '(%s%%) %s [%s]<br/>' % (pct, author[0], xdid)
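+                        # building a lookup once, e.g. {d['xdid']: d for d in puzzles},
+                        # would avoid rescanning the whole list for every match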
                 total_similar.append(similar_pct)
         else:
             similar_text = "0"
@@ -174,31 +173,32 @@ def get_cell_classes(r):
         row_dict = {}  # Map row and style
         if similar_text and similar_text != "0":
             # http://stackoverflow.com/questions/1418838/html-making-a-link-lead-to-the-anchor-centered-in-the-middle-of-the-page
-            pubidtext = '' % r.xdid
+            pubidtext = '' % r['xdid']
+            pubidtext += ''
-            pubidtext += html.mkhref(r.xdid, '/pub/' + r.xdid)
-            c_grids[r.Date] = {
-                'link' : '/pub/%s%s/index.html#' % (pubid, year) + r.xdid,
-                'class': get_cell_classes(r),
+            pubidtext += html.mkhref(r['xdid'], '/pub/' + r['xdid'])
+            c_grids[r['Date']] = {
+                'link' : '/pub/%s%s/index.html#' % (pubid, year) + r['xdid'],
+                'class': get_cell_classes(r),
                 'title': br_with_n(similar_text),
             }
             row_dict['tag_params'] = {
-                'onclick': 'location.href=\'/pub/%s\'' % r.xdid,
+                'onclick': 'location.href=\'/pub/%s\'' % r['xdid'],
                 'class': 'puzzlehl'
             }
         else:
-            pubidtext = r.xdid
+            pubidtext = r['xdid']
             row_dict['class'] = 'puzzle'
-
-        row = [
+            continue  # don't display unique puzzles in table; refer to download/google sheet
+
+        row = [
             pubidtext,
-            r.Date,
-            r.Size,
-            r.Title,
-            r.Author,
-            r.Editor,
-            r.Copyright,
-            r.A1_D1,
+            r['Date'],
+            r['Size'],
+            r['Title'],
+            r['Author'],
+            r['Editor'],
+            r['Copyright'],
+            r['A1_D1'],
             reused_clue_pct,
             similar_text
         ]
@@ -206,17 +206,14 @@ def get_cell_classes(r):
         outf.write_row('pub/%s%s.tsv' % (pubid, year), " ".join(pubyear_header), row)
         row_dict['row'] = row
         rows.append(row_dict)
-
+
         pub_grids[pubid][year] = c_grids
 
-        # Generate calendar
+        # Generate calendar
         onepubyear_html = GridCalendar(c_grids).formatyear(year, 6) + "<br>"
-        # Generate html table sorted by 2nd element of row (date)
-        onepubyear_html += html.html_table(sorted(rows , key=lambda x: x['row'][1]), pubyear_header, "puzzle", "puzzles")
+        onepubyear_html += html.html_table(sorted(rows, key=lambda x: x['row'][1]), pubyear_header, "puzzle", "puzzles")
         outf.write_html("pub/%s%s/index.html" % (pubid, year), onepubyear_html, title="%s %s" % (pubid, year))
-
-
         cluepct = ""
         wordpct = ""
         if total_clues:
diff --git a/scripts/33-mkwww-words.py b/scripts/33-mkwww-words.py
index a5268fe..6eca1e2 100755
--- a/scripts/33-mkwww-words.py
+++ b/scripts/33-mkwww-words.py
@@ -3,7 +3,7 @@
 from queries.similarity import find_similar_to, find_clue_variants, load_clues, load_answers
 from xdfile.utils import get_args, open_output, find_files, log, debug, get_log, COLUMN_SEPARATOR, EOL, parse_tsv, progress, parse_pathname
 from xdfile.html import th, td, mkhref, html_select_options
-from xdfile import corpus, clues, pubyear, metadatabase, utils
+from xdfile import corpus, clues, pubyear, metadatabase, utils, metasql
 from collections import Counter
 import random
 
@@ -13,8 +13,8 @@ def xd_metadata_row(xdid):
     if not g_puzzles_md:
-        for r in metadatabase.xd_puzzles().values():
-            g_puzzles_md[r.xdid] = r
+        for r in metasql.select('SELECT * FROM puzzles;'):
+            g_puzzles_md[r['xdid']] = r
     return g_puzzles_md[xdid]
 
@@ -31,18 +31,17 @@ def mkwww_wordpage(answer):
     for ca in sorted(uses, reverse=True, key=lambda ca: ca.date):
         try:
             md = xd_metadata_row(ca.xdid())
-            h += td(md.xdid, ca.clue, md.Author, md.Copyright)
+            h += td(md['xdid'], ca.clue, md['Author'], md['Copyright'])
         except Exception as e:
-            h += td(ca.xdid(), ca.clue, str(e))
+            h += td(ca.xdid, ca.clue, str(e))
             if utils.get_args().debug:
                 raise
 
     h += ''
-#    h += ''
-#    h += 'Mutations: '
+#    h += ''
-
-    return h
+    return h
 
 
 def main():
@@ -60,7 +59,6 @@ def main():
     h = '  • %d different words  • ' % len(all_uses)
     h += '    Most used words    '
-    h += ''
 
     h += th("word", "# uses", "clues used with this answer")
 
@@ -78,5 +76,4 @@
     outf.write_html('pub/word/%s/index.html' % word.upper(), mkwww_wordpage(word), title=word)
 
     outf.write_html('pub/word/index.html', h, title="Words")
-
 main()
diff --git a/scripts/37-pubyear-svg.py b/scripts/37-pubyear-svg.py
new file mode 100755
index 0000000..7937524
--- /dev/null
+++ b/scripts/37-pubyear-svg.py
@@ -0,0 +1,286 @@
+#!/usr/bin/python3
+
+import re
+import operator
+
+from datetime import date
+from xdfile import utils
+from xdfile import metasql as metadb
+from xdfile import html
+from xdfile.utils import space_with_nbsp
+import xdfile
+from collections import defaultdict, OrderedDict
+
+
+svg_w = 32
+svg_h = 35
+decade_scale = 1.3
+
+pys = '''
+
+
+
+{body}
+
+'''
+
+legend = '''
+Broken out by day-of-week (Monday at top, Sunday at bottom). Thicker lines mean larger puzzles.
+
+
+
+
+
+
+
+      50%+ grid match of an earlier puzzle, same author (reprint/resubmission)
+      30-50% grid match of an earlier puzzle (likely theme copy)
+      50%+ grid match of an earlier puzzle, different author (suspicious)
+
+      crosswords available for public download
+      crosswords currently not publicly available
+
+'''
+
+
+def rect(x, y, w, h, *classes):
+    return '<rect x="{x}" y="{y}" width="{w}" height="{h}" class="{classes}"/>\n'.format(x=int(x), y=int(y), w=int(w), h=int(h), classes=''.join(classes))
+
+
+def year_from(dt):
+    return int(dt.split('-')[0])
+
+
+def weekdays_between(dta, dtb):
+    return 0
+
+
+def pubyear_svg(rows, height=svg_h, width=svg_w, pubid='', year=''):  # , nsusp, ndup, npub, npriv):
+    bgclass = "notexists"
+#    if bgclass not in publications.tsv:
+#        bgclass = "exists"
+
+    rects = ''
+    """
+    pubid CHAR(6),        -- "nyt"
+    year CHAR(4),         -- "2006"
+    weekday CHAR(3),      -- "Mon"
+    Size TEXT,            -- most common entry
+    Editor TEXT,          -- most common entry
+    Copyright TEXT,       -- most common, after removing Date/Author
+    NumExisting INTEGER,  -- known or assumed to be in existence (0 means unknown)
+    NumXd INTEGER,        -- total number in xd
+    NumPublic INTEGER,    -- available for public download
+    -- duplicate grids, same author
+    NumReprints INTEGER,  -- 100% grid match
+    NumTouchups INTEGER,  -- 75-99% grid match
+    NumRedone INTEGER,    -- 30-75% grid match
+    -- duplicate grids, different author
+    NumSuspicious INTEGER,   -- >50% similar grid
+    NumThemeCopies INTEGER   -- 30-50% similar grid
+    """
+    row = rows[0]
+    svgtitle = '{} {}\n'.format(row['pubid'], row['year'])
+    svgtitle += 'Copyright: {}\n'.format(row['Copyright']) if row['Copyright'] else ''
+    svgtitle += 'Editor: {}'.format(row['Editor']) if row['Editor'] else ''
+
+    for i, wd in enumerate(utils.WEEKDAYS):  # range(0, 7):
+        row = rows[i]
+        y = i*2 + 2
+        num_existing = 52 if 's' not in year else 520  # (eventually number of this weekday in that year, *10 for decades)
+
+        num_xd = row["NumXd"]
+
+        # dup_length is length of dup/orange line
+        num_dup = row['NumReprints'] + row['NumTouchups'] + row['NumRedone']
+
+        # susp_length is length of suspicious/red line
+        num_susp = row['NumSuspicious']
+        num_theme = row['NumThemeCopies']
+        # TODO: base color on suspicious vs theme (darker when only suspicious)
+
+        num_pub = row['NumPublic']
+
+        num_priv = num_xd - num_pub
+
+        pixel_prexd = 0
+        pixel_postxd = 0
+        if num_xd < num_existing:
+            # for now; eventually should use earliest/latest date and puzzle to determine which side has gap
+            # npre = weekdays_between(date(year_from(firstxd.Date), 1, 1), firstxd.Date, i)
+            # npost = weekdays_between(lastxd.Date, date(year_from(lastxd.Date), 12, 31), i)
+            pixel_prexd = 1
+            pixel_postxd = 1
+
+        if not num_xd or not num_existing:
+            continue
+
+        pixel_total = width - pixel_prexd - pixel_postxd
+
+        if num_xd <= num_existing:
+            pixel_xd = pixel_total * num_xd / num_existing
+        else:
+            pixel_xd = pixel_total
+
+        # then convert num_* to pixel_*, num_existing to pixel_total
+        pixel_susp = num_susp*pixel_xd/num_xd
+        pixel_theme = num_theme*pixel_xd/num_xd
+        pixel_dup = num_dup*pixel_xd/num_xd
+        pixel_pub = num_pub*pixel_xd/num_xd
+        pixel_priv = num_priv*pixel_xd/num_xd
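+        # e.g. with num_existing=52 and num_xd=26, pixel_xd spans half of
+        # pixel_total, and each num_* count is scaled by the same ratio so
+        # the colored segments always sum to the bar drawn for that weekday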
+        if pixel_theme > 0 and pixel_theme < 1:
+            pixel_theme = 1
+        if pixel_susp > 0 and pixel_susp < 1:
+            pixel_susp = 1
+        if pixel_dup > 0 and pixel_dup < 1:
+            pixel_dup = 1
+
+        m = re.match(r'(\d+?)x(\d+?).*', row['Size'])
+        if m:
+            sz = int(m.group(1)) * int(m.group(2))
+            if sz > 17*17:
+                h = 4
+            else:
+                h = 1.5
+        else:
+            h = 1
+
+        x = 0
+        w = 6
+        rects += ''''''.format(utils.WEEKDAYS[i], y=int(y))
+
+        w = pixel_prexd
+#        rects += rect(x, y, w, h, 'prexd')
+        x += w
+
+        w = pixel_susp
+        rects += rect(x, y, w, h, 'suspxd')
+        x += w
+
+        w = pixel_theme
+        rects += rect(x, y, w, h, 'themexd')
+        x += w
+
+        w = pixel_dup
+        rects += rect(x, y, w, h, 'dupxd')
+        x += w
+
+        if x <= pixel_total:
+            w = min(pixel_total - x, max(0, pixel_priv))
+            rects += rect(x, y, w, h, 'privxd')
+            x += w
+
+        if x <= pixel_total:
+            w = min(pixel_total - x, max(0, pixel_pub))
+            rects += rect(x, y, w, h, 'pubxd')
+            x += w
+
+#        w = pixel_postxd
+#        rects += rect(x, y, w, h, 'postxd')
+    rects += ''
+    href = "/pub/%s%s" % (pubid, year) if 's' not in year else "/pub/%s/index.html#%s" % (pubid, year[:-1])
+    ret = html.mkhref(pys.format(w=width, h=height, classes=bgclass, body=rects), href, svgtitle)
+    return ret
+
+
+def main():
+    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
+    p.add_argument('-a', '--all', default=False, help='analyze all puzzles, even those already in similar.tsv')
+    args = utils.get_args(parser=p)
+    outf = utils.open_output()
+
+    pubyears = defaultdict(list)
+    pubyears_idx = defaultdict(list)
+    # years_idx = []
+    for r in metadb.select("SELECT * FROM stats"):
+        y = r['year'] or '9999'
+        pubyear = r['pubid'] + y
+        pubyears[pubyear].append(r)
+        if y not in pubyears_idx[r['pubid']]:
+            pubyears_idx[r['pubid']].append(y)
+        # if r['year'] not in years_idx:
+        #     years_idx.append(r['year'])
+
+    # Making collapsed decades depends on args
+    skip_decades = None
+    skip_decades = skip_decades if skip_decades else { 'start': 1910, 'end': 1980 }
+    allyears = []
+    for i in range(skip_decades['start']//10, skip_decades['end']//10 + 1):
+        allyears.append("%s0s" % i)
+    allyears.extend([ str(y) for y in range(skip_decades['end'] + 10, date.today().year + 1) ])
+
+    html_out = []
+    html_out.append(legend)
+    html_out.append('')
+
+    # Table header with years \ decades
+    year_header = []
+    year_header.append('')
+    for year in sorted(allyears):
+        if year[-1] == 's':
+            lead = ''
+            yclass = 'decade'
+        elif year[3] == '0':
+            lead = year[:2]
+            yclass = 'zero-year'
+        else:
+            lead = ' '
+            yclass = 'ord-year'
+        year_header.append(''.format(yclass, lead, year[2:]))
+    year_header.append('')
+    html_out.extend(year_header)
+
+    sorted_idx = OrderedDict(sorted(pubyears_idx.items(), key=lambda r: min(r[1])))
+    for pub in sorted_idx:
+        # Process each pub in index
+        pubobj = metadb.xd_publications().get(pub)
+        pubname = pubobj.PublicationName if pubobj else ''
+        html_out.append(''.format(html.mkhref((pubname or pub), pub)))
+        for year in sorted(allyears):
+            py = pub + year
+            py_svg = None
+            html_out.append('     {}    {}    {}')
+            if 's' not in year:
+                # Process for single year
+                if py in pubyears:
+                    py_svg = pubyear_svg(pubyears[py], pubid=pub, year=year)
+            else:
+                # Process for decade
+                decade = []
+                row_id = ['NumXd', 'NumReprints', 'NumTouchups', 'NumRedone', 'NumSuspicious', 'NumThemeCopies', 'NumPublic']
+                for wdi, wd in enumerate(utils.WEEKDAYS):
+                    wd_dict = {}
+                    wd_dict['weekday'] = wd
+                    wd_dict['pubid'] = pub
+                    wd_dict['year'] = year
+                    wd_dict['Copyright'] = ''
+                    wd_dict['Editor'] = ''
+                    wd_dict['Size'] = ''
+                    for dec_year in [year[:3]+str(y) for y in range(0, 10)]:
+                        for rid in row_id:
+                            if pubyears[pub+dec_year]:
+                                if rid in wd_dict:
+                                    wd_dict[rid] += pubyears[pub+dec_year][wdi][rid]
+                                else:
+                                    wd_dict[rid] = pubyears[pub+dec_year][wdi][rid]
+                    # Emulate 7 rows per decade
+                    if row_id[0] in wd_dict:
+                        decade.append(wd_dict)
+                py_svg = pubyear_svg(decade, width=svg_w*decade_scale, year=year, pubid=pub) if decade else None
+
+            if py_svg:
+                html_out.append(py_svg)
+            else:
+                width = svg_w if 's' not in year else svg_w*decade_scale
+                html_out.append(pys.format(w=width, h=svg_h, title='', classes='notexists', body=''))
+
+        html_out.append('{}    ')
+
+    html_out.extend(year_header)
+    outf.write_html('pub/index.html', "".join(html_out), "Published crosswords by year")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/40-deploy.sh b/scripts/40-deploy.sh
index 901ccbe..159bdb4 100755
--- a/scripts/40-deploy.sh
+++ b/scripts/40-deploy.sh
@@ -17,4 +17,3 @@ aws s3 sync --region $REGION $WWW ${S3WWW}/ --acl public-read
 ALLLOGS=$WWW/log/$TODAY-logs.txt
 scripts/49-cat-logs.py -o $ALLLOGS $PUB $TMP
 aws s3 cp --region $REGION $ALLLOGS ${S3WWW}/logs/ --acl public-read
-
diff --git a/scripts/41-git-commit.sh b/scripts/41-git-commit.sh
index e983880..5f152d2 100755
--- a/scripts/41-git-commit.sh
+++ b/scripts/41-git-commit.sh
@@ -34,6 +34,7 @@ else
     echo "SUMMARY: Commiting into master"
     git add .
     git commit -m "incoming for $TODAY"
+    ssh-agent bash -c "ssh-add ${SSHHOME}/.ssh/gxd_rsa; git push"
 fi
diff --git a/scripts/49-cat-logs.py b/scripts/49-cat-logs.py
index 9dc6b8b..4ebe438 100755
--- a/scripts/49-cat-logs.py
+++ b/scripts/49-cat-logs.py
@@ -6,7 +6,8 @@
 # concatenates .log files (even those in subdirs or .zip) and combines into a single combined.log
 
 from xdfile.utils import find_files_with_time, open_output, get_args
-from boto.s3.connection import S3Connection
+import boto3
+# from boto.s3.connection import S3Connection
 import os
 
@@ -14,15 +15,15 @@ def main():
     args = get_args('aggregates all .log files')
     outf = open_output()
 
-    print(os.environ['AWS_ACCESS_KEY'], os.environ['AWS_SECRET_KEY'])
-    conn = S3Connection(aws_access_key_id=os.environ['AWS_ACCESS_KEY'], aws_secret_access_key=os.environ['AWS_SECRET_KEY'])
-    print(conn)
-    s3path = "s3://" + os.environ['BUCKET'] + "/logs/"
-    bucket = conn.get_bucket(s3path)
-    print(bucket, s3path)
-    for key in sorted(bucket.list(), key=lambda x: x.last_modified):
+    s3 = boto3.resource('s3')
+    s3path = "logs/"
+    # bucket = conn.get_bucket(s3path)
+    bucket = s3.Bucket(os.environ['BUCKET'])
+
+    for obj in sorted(bucket.objects.all(), key=lambda x: x.last_modified):  # last_modified
-        print("Name: %s LastModified:%s" % (key.name.encode('utf-8'), key.last_modified))
+        if s3path in obj.key:
+            print("Name: %s LastModified:%s" % (obj.key.encode('utf-8'), obj.last_modified))
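+            # NOTE (sketch): bucket.objects.filter(Prefix=s3path) would let S3
+            # do the prefix filtering server-side instead of listing the whole
+            # bucket; .filter(Prefix=...) is part of the boto3 resource API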
 
     for fn, contents, dt in sorted(find_files_with_time(*args.inputs, ext=".log"), key=lambda x: x[2]):  # earliest first
         outf.write_file(fn, contents.decode("utf-8"))
diff --git a/scripts/html/style.css b/scripts/html/style.css
index 5c0e30e..d2888d4 100644
--- a/scripts/html/style.css
+++ b/scripts/html/style.css
@@ -1,5 +1,57 @@
-.year_widget rect {
-    fill:green;
+#pubyearmap td {
+    border-right: 1px solid grey;
+    border-bottom: 1px solid grey;
+    text-align: center;
+}
+
+#pubyearmap td.header {
+    font-weight: bold;
+    font-size: 12px;
+    width: 60px;
+}
+
+#pubyearmap td.year_widget {
+    background: #a8a8a8;
+}
+
+#pubyearmap td.decade {
+    background: lightgrey;
+    color: black;
+/*    border-color: white; */
+    font-weight: bold;
+    font-size: 12px;
+}
+
+#pubyearmap td.zero-year {
+    background: lightgrey;
+    color: black;
+/*    border-color: white; */
+    font-weight: bold;
+    font-size: 12px;
+}
+
+#pubyearmap td.ord-year {
+    background: #ccf;
+    color: black;
+/*    border-color: white; */
+    font-size: 12px;
+}
+
+td:hover {
+    background: darkgray;
+}
+
+th {
+    font-weight: bold;
+    font-size: 12px;
+}
+
+.year_widget {
+    fill: #a8a8a8;
+    overflow: visible;
 }
 
 .year_widget rect.red {
@@ -14,10 +66,40 @@
     fill:white;
 }
 
-.year_widget rect.similar10 {
+.similar10 {
     fill:lightsalmon;
 }
 
+/* Prexd */
+.notexists {
+    fill: white;
+}
+
+.suspxd {
+    fill: darkred;
+    background-color: darkred;
+}
+
+.dupxd {
+    fill: yellow;
+    background-color: yellow;
+}
+
+.themexd {
+    fill: orange;
+    background-color: orange;
+}
+
+.privxd {
+    fill: blue;
+    background-color: blue;
+}
+
+.pubxd {
+    fill: green;
+    background-color: green;
+}
+
 .year_widget text {
     fill: black;
     font: 10px sans-serif;
diff --git a/scripts/meta.sql b/scripts/meta.sql
index 5020264..4c139fd 100644
--- a/scripts/meta.sql
+++ b/scripts/meta.sql
@@ -1,11 +1,75 @@
-CREATE TABLE "receipts" (
-    "CaptureTime" TEXT,
-    "ReceivedTime" TEXT,
-    "ExternalSource" TEXT,
-    "InternalSource" TEXT,
-    "SourceFilename" TEXT,
-    "xdid" TEXT
+-- xd
+
+CREATE TABLE receipts (
+    CaptureTime TEXT,
+    ReceivedTime TEXT,
+    ExternalSource TEXT,
+    InternalSource TEXT,
+    SourceFilename TEXT,
+    xdid CHAR(16),
+    PRIMARY KEY (ExternalSource, SourceFilename)
+);
+
+CREATE INDEX XDID on receipts (xdid ASC);
+
+
+CREATE TABLE similar_grids (
+    xdid CHAR(16),
+    xdidMatch CHAR(16),
+    GridMatchPct INTEGER
+);
+
+
+CREATE TABLE similar_clues (
+    xdid CHAR(16),
+    reused_clues INTEGER,
+    reused_answers INTEGER,
+    total_clues INTEGER
 );
 
-CREATE INDEX "XDID" on receipts (xdid ASC);
+
+CREATE TABLE publications (
+    PublicationAbbr CHAR(8),
+    PublisherAbbr CHAR(8),
+    PublicationName TEXT,
+    PublisherName TEXT,
+    FirstIssueDate CHAR(10),
+    LastIssueDate CHAR(10),
+    NumberIssued INTEGER,
+    Contact TEXT,
+    Sources TEXT
+);
+
+
+CREATE TABLE puzzles (
+    xdid CHAR(16),   -- "eltana-001"
+    Date CHAR(10),   -- "2016-07-18"
+    Size CHAR(8),    -- "15x15RS" (Rebus/Special)
+    Title TEXT,
+    Author TEXT,
+    Editor TEXT,
+    Copyright TEXT,
+    A1_D1 TEXT
+);
+
+
+-- grouped by pub-year-weekday
+CREATE TABLE stats (
+    pubid CHAR(6),    -- "nyt"
+    year CHAR(4),     -- "2006"
+    weekday CHAR(3),  -- "Mon"
+    Size TEXT,        -- most common entry
+    Editor TEXT,      -- most common entry
+    Copyright TEXT,   -- most common, after removing Date/Author
+    NumExisting INTEGER,  -- known or assumed to be in existence (0 means unknown)
+    NumXd INTEGER,        -- total number in xd
+    NumPublic INTEGER,    -- available for public download
+    -- duplicate grids, same author
+    NumReprints INTEGER,  -- 100% grid match
+    NumTouchups INTEGER,  -- 75-99% grid match
+    NumRedone INTEGER,    -- 30-75% grid match
+    -- duplicate grids, different author
+    NumSuspicious INTEGER,   -- >50% similar grid
+    NumThemeCopies INTEGER   -- 30-50% similar grid
+);
diff --git a/scripts/tsv2sqlite.py b/scripts/tsv2sqlite.py
index 86b9deb..6ca814f 100755
--- a/scripts/tsv2sqlite.py
+++ b/scripts/tsv2sqlite.py
@@ -11,17 +11,46 @@
 from xdfile import metadatabase as metadb
 
+# Map tsvtype to sql table
+sqlmap = {
+    'Receipt' : 'receipts',
+    'Publication' : 'publications',
+}
+
 
 def main():
     p = args_parser('convert .tsv to sqlite')
+    p.add_argument('--tsvtype', default=None, help='Tsv file type to import')
     args = get_args(parser=p)
 
-    sqlconn = sqlite3.connect(args.output)
-    cur = sqlconn.cursor()
+    if args.tsvtype is not None:
+        # Process only if tsvtype supplied
+        sqlconn = sqlite3.connect(args.output)
+        cur = sqlconn.cursor()
+        rows = [list(r) for r in xdfile.utils.parse_tsv_rows(args.inputs[0], args.tsvtype)]
+
+        if args.tsvtype == 'Similar':
+            # Fill up similar_clues first
+            sclues = [[x[0], x[2], x[3], x[4]] for x in rows]
+            INS_TMPL = ",".join('?' * len(sclues[0]))
+            cur.executemany('INSERT INTO %s VALUES (%s)' % ('similar_clues', INS_TMPL), sclues)
+            # Fill up similar_grids
+            sgrids = []
+            for r in rows:
+                if '=' in r[5]:
+                    for pos in r[5].split(' '):
+                        (xdidm, pctm) = pos.split('=')
+                        sgrids.append([r[0], xdidm, int(pctm)])
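+            # e.g. a matches field "abc1995-01-01=100 xyz2001-09-09=40"
+            # yields two similar_grids rows: [xdid, 'abc1995-01-01', 100]
+            # and [xdid, 'xyz2001-09-09', 40]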
+
+            INS_TMPL = ",".join('?' * len(sgrids[0]))
+            cur.executemany('INSERT INTO %s VALUES (%s)' % ('similar_grids', INS_TMPL), sgrids)
+        else:
+            rows = [list(r) for r in xdfile.utils.parse_tsv_rows(args.inputs[0], args.tsvtype)]
 
-    rows = [list(r) for r in xdfile.utils.parse_tsv_rows(args.inputs[0], "Receipt")]
-    info("Rows to be inserted to sql: %s" % len(rows))
-    cur.executemany('INSERT INTO receipts VALUES (?,?,?,?,?,?)', rows)
-    sqlconn.commit()
+            info("Rows to be inserted to sql table [ %s ]: %s" % (sqlmap[args.tsvtype], len(rows)))
+            INS_TMPL = ",".join('?' * len(rows[0]))
+            cur.executemany('INSERT OR IGNORE INTO %s VALUES (%s)' % (sqlmap[args.tsvtype], INS_TMPL), rows)
+        sqlconn.commit()
 
 if __name__ == "__main__":
     main()
diff --git a/xdfile/cloud.py b/xdfile/cloud.py
index 77cd1b4..117e20c 100644
--- a/xdfile/cloud.py
+++ b/xdfile/cloud.py
@@ -1,6 +1,6 @@
 import boto3
 
-from xdfile.utils import log, info
+from xdfile.utils import log, info, debug, error
 
 def xd_send_email(destaddr, fromaddr='admin@xd.saul.pw', subject='', body=''):
     client = boto3.client('ses')
diff --git a/xdfile/metasql.py b/xdfile/metasql.py
index 60c71bc..6b8c049 100644
--- a/xdfile/metasql.py
+++ b/xdfile/metasql.py
@@ -13,6 +13,7 @@
 METADB = "meta.db"  # SQLLite database
 METADB_RECEIPTS = "receipts"  # Receipts table
+METADB_PUZZLES = 'puzzles'  # Puzzles table
 
 RECEIPTS_TSV = "gxd/receipts.tsv"
 SIMILAR_TSV = "gxd/similar.tsv"
@@ -101,7 +102,12 @@ def xd_publications():
 
 @utils.memoize
 def xd_puzzles():
-    return utils.parse_tsv(PUZZLES_TSV, "Puzzle")
+    cursor.execute('SELECT * FROM %s' % (METADB_PUZZLES))
+    for c in cursor.fetchall():
+        print("C: %s" % c)
+
+    #return utils.parse_tsv(PUZZLES_TSV, "Puzzle")
 
 @utils.memoize
 def xd_similar():
@@ -135,12 +141,25 @@ def append_row(tsvpath, headerstr, row, to_sql=False):
         fp.write(COLSEP.join([str(x) for x in row]) + EOL)
         fp.close()
     else:
+        # tsvpath = SQL table name
         cur = sqlconn.cursor()
-        INS_TMPL = ",".join('?' * len(COLSEP.split(headerstr)))
-        cur.execute("INSERT INTO %s VALUES (%s)" % (METADB_RECEIPTS, INS_TMPL), ([str(x) for x in row]))
+        INS_TMPL = ",".join('?' * len(row))
+        cur.execute("INSERT INTO %s VALUES (%s)" % (tsvpath, INS_TMPL), ([str(x) for x in row]))
         sqlconn.commit()
 
 
+def select(query, *args):
+    # Execute a SQL statement w/o commit and return all rows
+    cursor.execute(query, *args)
+    return cursor.fetchall()
+
+
+def execute(query, *args):
+    # Execute a SQL statement with commit
+    cursor.execute(query, *args)
+    sqlconn.commit()
+    # return cursor.fetchall()
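+# e.g. select("SELECT * FROM %s WHERE xdid = ?" % METADB_PUZZLES, ("nyt1942-02-15",))
+# passes the parameter tuple straight through to cursor.execute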
    %s" % (msy, lsy) + def get_pubheader_classes(*years): """ Assign classes to years header @@ -34,9 +32,34 @@ def get_pubheader_classes(*years): if " " in str(y): classes.append("ord-year") else: - classes.append("zero-year") + classes.append("zero-year") return classes - + + +def year_widget(dow_dict, total, fill_class=None): + # Generate SVG based widget for day of week dispersion for year + fill_class = fill_class or 'white' + b = [] + b.append('') + b.append('' % fill_class) + for i, v in enumerate(utils.WEEKDAYS): + _class = dow_dict[v]['class'] if 'class' in dow_dict[v].keys() else '' + _length = str(dow_dict[v]['count']) if 'count' in dow_dict[v].keys() else '0' + _length = _length if int(_length) < 26 else '30' # for all 52/2 have full filled row + b.append('') + b.append('') + return(' '.join(b)) + +def decade_widget(total, fill_class=None): + # Generate SVG based widget for decade showing total + fill_class = fill_class or 'green' + b = [] + b.append('') + b.append('' % fill_class) + b.append('' + str(total) + '') + b.append('') + return(' '.join(b)) + g_all_pubyears = None def pubyear_html(pubyears=[], skip_decades=None): @@ -47,9 +70,9 @@ def pubyear_html(pubyears=[], skip_decades=None): if not g_all_pubyears: g_all_pubyears = utils.parse_tsv_data(open("pub/pubyears.tsv").read(), "pubyear") - + # Read similars to make background of widgets - similar_d = defaultdict(dict) + similar_d = defaultdict(dict) for xdid, v in utils.parse_tsv('gxd/similar.tsv', "similar").items(): xd_split = utils.split_xdid(xdid) if xd_split: @@ -61,7 +84,7 @@ def pubyear_html(pubyears=[], skip_decades=None): b = [] # Body - # Making collapsed decaded depends on args + # Making collapsed decades depends on args skip_decades = skip_decades if skip_decades else { 'start': 1910, 'end': 1970 } allyears = [] for i in range(skip_decades['start']//10, skip_decades['end']//10 + 1): @@ -108,25 +131,25 @@ def pubyear_html(pubyears=[], skip_decades=None): 'hint': hint, 'total': int(total), } - + # main table b.append('') yhdr = [ ' ' ] + [ split_year(y) for y in allyears ] yhdr.append("all") b.append(td_with_class(*yhdr, classes=get_pubheader_classes(*yhdr), rowclass="pubyearhead",tag="th")) - b.append(tr_empty()) - - # Process each pubid sorted by earliest year + b.append(tr_empty()) + + # Process each pubid sorted by earliest year for pubid in sorted(pubs, key=lambda x:min(pubs[x])): pub = metadb.xd_publications().get(pubid) pubname = pub.PublicationName if pub else '' - # Pub id to first column + # Pub id to first column b.append(mktag('tr')) b.append(mktag('td','pub')) b.append(mkcell(space_with_nbsp(pubname or pubid), "/pub/" + pubid, )) b.append(mktag('/td')) - + # Process each year not collapsed into decade for yi in allyears: if yi in pubs[pubid] and pubs[pubid][yi]['total'] > 0: @@ -140,11 +163,11 @@ def pubyear_html(pubyears=[], skip_decades=None): b.append(mktag('td', 'block')) b.append(' ') b.append(mktag('/td')) - + b.append(mktag('td')) b.append(str(sum([ pubs[pubid][x]['total'] for x in pubs[pubid].keys() ]))) b.append(mktag('/td')) b.append(mktag('/tr')) - + b.append(mktag('/table')) return (" ".join(b)) diff --git a/xdfile/xdfile.py b/xdfile/xdfile.py index 98adba8..7a50801 100755 --- a/xdfile/xdfile.py +++ b/xdfile/xdfile.py @@ -77,6 +77,9 @@ def height(self): def size(self): return (self.width(), self.height()) + def sizestr(self): + return "%dx%d%s%s" % (self.width(), self.height(), self.get_header("Rebus") and "R" or "", self.get_header("Special") and "S" or "") + def xdid(self): num = 
+        return "%dx%d%s%s" % (self.width(), self.height(), self.get_header("Rebus") and "R" or "", self.get_header("Special") and "S" or "")
+
     def xdid(self):
         num = self.get_header("Number")
         if num:
@@ -491,7 +494,7 @@ def get_xd(xdid):
     try:
         xd = xdfile(corpus_contents()[xdid].decode("utf-8"), xdid)
     except Exception as e:
-        error("get_xd() %s" % str(e))
+        # error("get_xd() %s" % str(e))
         return None
     return xd