diff --git a/AWS.md b/AWS.md
new file mode 100644
index 0000000..8e707f4
--- /dev/null
+++ b/AWS.md
@@ -0,0 +1,22 @@
+# AWS commands
+
+## Extend root volume of instance
+1) Stop the instance
+
+2) Create a snapshot
+aws ec2 create-snapshot --volume-id {cur_root_volume} --description 'Initial snapshot'
+
+3) Create a new root volume; get the snapshot id from the previous output
+aws ec2 create-volume --size {new_size_gb} --region us-west-2 --availability-zone us-west-2a --volume-type standard --snapshot-id {snapshot_id_of_root_volume}
+
+4) Detach the existing root volume
+aws ec2 detach-volume --volume-id {cur_root_volume}
+
+5) Attach the new root volume
+aws ec2 attach-volume --volume-id {new_root_volume_step3} --instance-id i-5fc17aca --device /dev/sda1
+
+6) Start the instance and proceed with the partition extension
+http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-expand-volume.html#recognize-expanded-volume-linux
+
+7) If all is OK, delete the old root volume
+aws ec2 delete-volume --volume-id {cur_root_volume_step2}
diff --git a/scripts/00-aws-bootstrap.sh b/scripts/00-aws-bootstrap.sh
index 556f598..0d3d501 100755
--- a/scripts/00-aws-bootstrap.sh
+++ b/scripts/00-aws-bootstrap.sh
@@ -52,7 +52,7 @@ echo "Import all .tsv to sql"
 scripts/05-sql-import-receipts.sh
 
 echo "Run deploy script"
-/bin/bash -x scripts/05-full-pipeline.sh
+/bin/bash scripts/05-full-pipeline.sh
 
 echo 'SUMMARY: End time '`date +'%Y-%m-%d %H:%M'`
 # Parse log to get summary to be mailed
diff --git a/scripts/00-aws-ebs-bootstrap.sh b/scripts/00-aws-ebs-bootstrap.sh
new file mode 100755
index 0000000..8130ff1
--- /dev/null
+++ b/scripts/00-aws-ebs-bootstrap.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+set -x
+
+WORKDIR=/tmp
+export SSHHOME=$HOME
+
+if [ -z "$HOME" ] ; then
+    # Hack for AWS where HOME is not set
+    export SSHHOME=$HOME
+    HOME=/tmp
+    if [[ $UID -eq '0' ]]; then
+        export SSHHOME=/root
+    fi
+fi
+
+# This script is passed as userdata to the launch-config, which the base AMI
+# executes at the end of initialization.
+
+export LC_ALL="en_US.UTF-8"
+export LOGFILE=/tmp/`date +"%Y-%m-%d"`.log
+export SUMLOGFILE=/tmp/`date +"%Y-%m-%d"`summary.log
+# To run xdfile-based scripts below
+export PYTHONPATH=.
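+
+# Mirror all stdout/stderr into ${LOGFILE} while still echoing to the console
+# (bash process substitution; tee -i ignores interrupts so logging survives signals)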
+exec > >(tee -i ${LOGFILE}) 2>&1
+echo 'SUMMARY: Start time:'`date +'%Y-%m-%d %H:%M'`
+
+# export DEBIAN_FRONTEND=noninteractive
+# sudo apt-get update && \
+# sudo apt-get install --yes language-pack-en-base zip awscli python3-lxml python3-pip git markdown python3-boto3 sqlite3 && \
+# sudo pip3 install cssselect botocore
+
+cd $WORKDIR
+# Get config file from AWS
+aws s3 cp --region=us-west-2 s3://xd-private/etc/config $WORKDIR/config
+source $WORKDIR/config
+
+echo "Clone main project repo and switch to branch ${BRANCH}"
+git clone ${XD_GIT}
+cd xd/
+git checkout ${BRANCH}
+# Export all config vars
+source scripts/config-vars.sh
+
+mkdir -p $SSHHOME/.ssh
+echo "Clone GXD repo"
+aws s3 cp --region=us-west-2 s3://xd-private/etc/gxd_rsa $SSHHOME/.ssh/
+chmod 600 $SSHHOME/.ssh/gxd_rsa
+
+cat src/aws/ssh_config >> $SSHHOME/.ssh/config
+ssh-agent bash -c "ssh-add $SSHHOME/.ssh/gxd_rsa; git clone ${GXD_GIT}"
+
+echo "Import all .tsv to sql"
+scripts/05-sql-import-receipts.sh
+
+echo "Run deploy script"
+/bin/bash scripts/05-full-pipeline.sh
+
+echo 'SUMMARY: End time '`date +'%Y-%m-%d %H:%M'`
+# Parse log to get summary to be mailed
+egrep -i 'ERROR|WARNING|SUMMARY' ${LOGFILE} > ${SUMLOGFILE}
+echo -e '\n' >> ${SUMLOGFILE}
+
+echo "Getting summary"
+scripts/48-stats.sh >> ${SUMLOGFILE}
+echo -e '\n' >> ${SUMLOGFILE}
+
+echo "SUMMARY: Full log file http://$BUCKET/logs/`basename ${LOGFILE}`" >> ${SUMLOGFILE}
+
+echo "Sending email"
+scripts/send-email.py $ADMIN_EMAIL "execution logs for $TODAY" ${SUMLOGFILE}
+
+echo "Copy logs to AWS"
+aws s3 cp --region ${REGION} --content-type='text/plain' ${LOGFILE} s3://${BUCKET}/logs/ --acl public-read
+aws s3 cp --region ${REGION} --content-type='text/plain' ${SUMLOGFILE} s3://${BUCKET}/logs/ --acl public-read
+
+echo "Make logs index page"
+scripts/49b-mkindex.sh
diff --git a/scripts/00-aws-ec2-launch-manual-ebs.sh b/scripts/00-aws-ec2-launch-manual-ebs.sh
new file mode 100755
index 0000000..d9dda62
--- /dev/null
+++ b/scripts/00-aws-ec2-launch-manual-ebs.sh
@@ -0,0 +1,58 @@
+#!/bin/bash -x
+#
+# Usage: $0 <config-file>
+# see format below
+#
+# export KEY=
+# export BRANCH=
+# export REGION=
+# export AWS_ACCESS_KEY=
+# export AWS_SECRET_KEY=
+# export BUCKET=
+# export EMAIL=
+# export XD_GIT=
+# export GXD_GIT=
+# export XD_PROFILE=
+# export AMI_ID=ami-75fd3b15 # Ubuntu Server 16.04 LTS (HVM)
+# export SSH_SECURITY_GID=sg-e00fbe87 # SSH access
+# export INSTANCE_TYPE=r3.large
+# export QUICKRUN=True # For a quick run, skipping the 20- and 30- scripts
+#
+#source src/aws/config
+
+aws="aws"
+sh="bash"
+
+XDCONFIG=$1
+if [ -n "$XDCONFIG" ]; then
+    aws s3 cp $XDCONFIG s3://xd-private/etc/config
+    source ${XDCONFIG}
+    # AMI_ID - 16.04 LTS amd64 hvm:ebs-ssd
+    # https://cloud-images.ubuntu.com/locator/ec2/
+    AMI_ID=ami-9ece19fe
+    INSTANCE_JSON=/tmp/instance.json
+
+    # created via IAM console: role/xd-scraper
+    $aws ec2 run-instances \
+        --key-name $KEY \
+        --region ${REGION} \
+        --instance-type ${INSTANCE_TYPE} \
+        --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"DeleteOnTermination":false}}]' \
+        --instance-initiated-shutdown-behavior stop \
+        --iam-instance-profile Arn="$XD_PROFILE" \
+        --user-data file://scripts/00-aws-bootstrap.sh \
+        --image-id ${AMI_ID} > $INSTANCE_JSON
+
+    # Wait a little before applying the security group
+    sleep 30
+    instance_id=$(cat $INSTANCE_JSON | jq -r .Instances[0].InstanceId)
+    $aws ec2 modify-instance-attribute --groups ${SSH_SECURITY_GID} --instance-id $instance_id
+
+    public_ip=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].PublicIpAddress')
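+    # NOTE (sketch): a fixed sleep can race instance startup, and
+    # PublicIpAddress may still be null here. Polling is safer, e.g.:
+    #   while [ -z "$public_ip" -o "$public_ip" = "null" ]; do
+    #       sleep 5
+    #       public_ip=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].PublicIpAddress')
+    #   done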
+    echo "Connecting: ssh -i ~/*.pem ubuntu@$public_ip"
+    ssh -i ~/*.pem ubuntu@$public_ip
+
+else
+    echo "Supply config file: $0 <config-file>"
+    exit 1
+fi
diff --git a/scripts/00-ebs-snapshots.sh b/scripts/00-ebs-snapshots.sh
new file mode 100755
index 0000000..da9697e
--- /dev/null
+++ b/scripts/00-ebs-snapshots.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# Manage snapshots for EBS storage
+# usage: $0 <instance-id>
+
+instance_id=$1
+
+instance_status=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].State')
+volume_id=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].BlockDeviceMappings[0].Ebs.VolumeId')
+
+echo "Instance status"
+echo "${instance_status}"
+
+# Get all snapshots for volume
+echo "Snapshots"
+aws ec2 describe-snapshots --filter Name=volume-id,Values=${volume_id}
diff --git a/scripts/00-ebs-start.sh b/scripts/00-ebs-start.sh
new file mode 100755
index 0000000..712ad15
--- /dev/null
+++ b/scripts/00-ebs-start.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Start an EBS-backed instance
+# Usage: $0 <instance-id>
+#
+
+instance_id=$1
+aws ec2 start-instances --instance-ids ${instance_id}
+sleep 10
+
+instance_status=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].State')
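+# NOTE: .State is an object like {"Code": 16, "Name": "running"}; append
+# .Name to the jq path above if only the state name is wanted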
+
+echo ${instance_status}
+
+public_ip=$(aws ec2 describe-instances --instance-ids ${instance_id} | jq -r '.Reservations[0].Instances[0].PublicIpAddress')
+
+echo "Connect in a few seconds: ssh -i ~/*.pem ubuntu@$public_ip"
diff --git a/scripts/05-full-pipeline.sh b/scripts/05-full-pipeline.sh
index b441d85..848a401 100755
--- a/scripts/05-full-pipeline.sh
+++ b/scripts/05-full-pipeline.sh
@@ -34,7 +34,7 @@ if [ ! -n "$QUICKRUN" ]; then
 fi
 
 # commit new puzzles and saved analysis results
-/bin/bash scripts/41-git-commit.sh incoming_$NOW
+/bin/bash scripts/41-git-commit.sh
 
 # capture all logs even if other scripts fail
 scripts/39-mkwww-logs.py -o $WWW/$NOW/index.html $TMP
diff --git a/scripts/05-sql-import-receipts.sh b/scripts/05-sql-import-receipts.sh
index e042226..ed76a71 100755
--- a/scripts/05-sql-import-receipts.sh
+++ b/scripts/05-sql-import-receipts.sh
@@ -6,5 +6,11 @@ METADB=meta.db
 
 if [ ! -f $METADB ] ; then
     sqlite3 $METADB < ./scripts/meta.sql
-    ./scripts/tsv2sqlite.py ${DEBUG} -o ${METADB} gxd/receipts.tsv
+    ./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Receipt" -o ${METADB} gxd/receipts.tsv
+    ./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Publication" -o ${METADB} gxd/publications.tsv
+    ./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Similar" -o ${METADB} gxd/similar.tsv
+else
+    echo "$METADB already exists"
 fi
diff --git a/scripts/20-analyze.sh b/scripts/20-analyze.sh
index 5c57682..43a9d62 100755
--- a/scripts/20-analyze.sh
+++ b/scripts/20-analyze.sh
@@ -8,9 +8,12 @@ mkdir -p $PUB
 rm -f $PUB/*
 
 # regenerate pub/puzzles.tsv
-scripts/21-clean-metadata.py -o $PUB/puzzles.tsv $GXD
+# TODO: should populate the puzzles table in sqlite instead
+scripts/21b-clean-metadata.py $GXD
 
-# regenerate pub/pubyears.tsv
+# generate pubyears just for now; TODO: to be replaced
 scripts/22-pubyears.py
+# regenerate pub/pubyears.tsv
 
 scripts/25-analyze-puzzle.py -o $WWW/ -c $GXD $GXD
 scripts/26-clues-tsv.py -c $GXD -o $PUB/
+scripts/27-pubyear-stats.py -c ${GXD}
diff --git a/scripts/21b-clean-metadata.py b/scripts/21b-clean-metadata.py
new file mode 100755
index 0000000..eb47522
--- /dev/null
+++ b/scripts/21b-clean-metadata.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+
+# Usage: $0 [-o <output>] <input>
+#
+# Generates puzzles.tsv with cleaned metadata for each .xd in <input>.
+#
+
+from xdfile import utils, metasql as metadb
+import xdfile
+import re
+
+
+CLEAN_SUFFIX = '_clean'
+
+
+def find_date(s):
+    m = re.search(r"\s*(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)?\s*(\d{1,2})?,?\s*\d{4},?\s*", s, flags=re.IGNORECASE)
+    if m:
+        return m.group(0)
+
+    m = re.search(r"\d{2}[/\-]?\d{2}[/\-]?\d{2,4}", s)
+    if m:
+        return m.group(0)
+
+    return ""
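+# e.g. find_date("© 2006, The New York Times") -> " 2006, "
+#      find_date("NOVEMBER 21, 1955") -> "NOVEMBER 21, 1955"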
+
+
+def boil_copyright(copyright, author):
+    if author:
+        copyright = copyright.replace(author, "")
+
+    # and remove textual date
+    dt = find_date(copyright)
+    if dt:
+        copyright = copyright.replace(dt, " ")
+
+#    copyright = copyright.replace(u"©", "(c)")
+
+    return copyright
+
+
+# also editor
+def clean_author(author, editor):
+    if author:
+        r = r'(?i)(?:(?:By )*(.+)(?:[;/,-]|and) *)?(?:edited|Editor|(?
int(args.limit): break
+        if similar_grids:
+            info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct)) for pct, xd1, xd2 in similar_grids))
diff --git a/scripts/27-pubyear-stats.py b/scripts/27-pubyear-stats.py
new file mode 100755
index 0000000..526a3da
--- /dev/null
+++ b/scripts/27-pubyear-stats.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+
+import json
+import re
+from collections import defaultdict, Counter
+
+from xdfile.utils import error, debug, info
+from xdfile import utils, metasql, metadatabase as metadb
+from xdfile import year_from_date, dow_from_date
+import xdfile
+
+
+def main():
+    args = utils.get_args('generate pub-years data')
+    outf = utils.open_output()
+
+    weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]
+
+    pubyears = {}  # (pubid, year) -> list of xd
+    for xd in xdfile.corpus():
+        puby = (xd.publication_id(), xd.year())
+        if puby not in pubyears:
+            pubyears[puby] = []
+        pubyears[puby].append(xd)
+
+    if pubyears:
+        metasql.execute("DELETE FROM stats;")
+
+    for puby, xdlist in sorted(pubyears.items()):
+        pubid, year = puby
+        npublic = 0
+
+        # TODO: SELECT FROM publications
+        nexisting = 0
+
+        # organize by day-of-week
+        byweekday = {}
+        byweekday_similar = {}
+        for w in weekdays:
+            byweekday[w] = []
+            byweekday_similar[w] = []
+
+        for xd in xdlist:
+            dow = dow_from_date(xd.get_header('Date'))
+            if dow:  # date might be empty or only a year
+                byweekday[dow].append(xd)
+
+        for r in metasql.select("SELECT * FROM similar_grids WHERE xdid LIKE '{}%' AND GridMatchPct > 25".format(pubid + str(year))):
+            xd = xdfile.get_xd(r['xdid'])
+            if xd:
+                dt = xd.get_header('Date')
+                if dt:
+                    dow = dow_from_date(dt)
+                    if dow:  # date might be empty or only a year
+                        byweekday_similar[dow].append(r)
+                else:
+                    debug("Date not set for: %s" % xd)
+
+        # tally stats
+        for weekday in weekdays:
+            copyrights = Counter()  # [copyright_text] -> number of xd
+            editors = Counter()  # [editor_name] -> number of xd
+            formats = Counter()  # ["15x15 RS"] -> number of xd
+            # TODO
+            nexisting = 0
+
+            nxd = len(byweekday[weekday])
+            public_xdids = []  # empty for now
+            for xd in byweekday[weekday]:
+                xdid = xd.xdid()
+                if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
+                    npublic += 1
+
+                editor = xd.get_header('Editor').strip()
+                if editor:
+                    editors[editor] += 1
+
+                sizestr = xd.sizestr()
+                if sizestr:
+                    formats[sizestr] += 1
+
+                copyright = xd.get_header('Copyright').strip()
+                if copyright:
+                    copyrights[copyright] += 1
+
+            # debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
+            def process_counter(count, comp_value):
+                # Return the most common entry, annotated with its count
+                # whenever it does not cover all comp_value puzzles
+                if count:
+                    item, num = count.most_common(1)[0]
+                    if num != comp_value:
+                        item += " (%s)" % num
+                else:
+                    item = ''
+                return item
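+            # e.g. process_counter(Counter({'Will Shortz': 48, 'Other': 4}), 52)
+            # -> 'Will Shortz (48)'; a count covering all 52 would yield
+            # 'Will Shortz' unannotated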
+
+            maineditor = process_counter(editors, nxd)
+            maincopyright = process_counter(copyrights, nxd)
+            mainformat = process_counter(formats, nxd)
+
+            reprints = 0
+            touchups = 0
+            redones = 0
+            copies = 0
+            themecopies = 0
+            for r in byweekday_similar[weekday]:
+                xd1 = xdfile.get_xd(r['xdid'])
+                xd2 = xdfile.get_xd(r['xdidMatch'])
+                if xd1 is None or xd2 is None:
+                    continue
+                # debug("XD1: %s XD2: %s" % (xd1, xd2))
+                dt1 = xd1.get_header('Date')
+                dt2 = xd2.get_header('Date')
+                aut1 = xd1.get_header('Author')
+                aut2 = xd2.get_header('Author')
+                pct = int(r['GridMatchPct'])
+                if dt2 < dt1:  # only capture the later one
+                    if aut1 == aut2:
+                        if pct == 100:
+                            reprints += 1
+                        elif pct >= 50:
+                            touchups += 1
+                        elif pct >= 30:
+                            themecopies += 1
+                    else:  # suspicious
+                        if pct >= 50:
+                            copies += 1
+                        elif pct >= 30:
+                            themecopies += 1
+
+            metasql.execute("INSERT INTO stats VALUES (?,?,?, ?,?,?, ?, ?,?,?, ?,?, ?,?)",
+                (pubid, year, weekday,
+                 mainformat, maineditor, maincopyright,
+                 nexisting, nxd, npublic,
+                 reprints, touchups, redones,
+                 copies, themecopies))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/30-mkwww.sh b/scripts/30-mkwww.sh
index e2789a8..1d4aa0d 100755
--- a/scripts/30-mkwww.sh
+++ b/scripts/30-mkwww.sh
@@ -15,6 +15,9 @@ cp $PUB/*.tsv $WWW/pub/
 echo -en "${GREEN}Generate /pub/[][]${NORMAL}\n"
 scripts/31-mkwww-publishers.py $CORPUS -o $WWW/
 
+echo -en "${GREEN}Generate /pub/ index${NORMAL}\n"
+scripts/37-pubyear-svg.py -o $WWW/
+
 echo -en "${GREEN}Generate /pub/word/${NORMAL}\n"
 scripts/33-mkwww-words.py $CORPUS -o $WWW/
diff --git a/scripts/31-mkwww-publishers.py b/scripts/31-mkwww-publishers.py
index 6a9ae6c..cf01c6c 100755
--- a/scripts/31-mkwww-publishers.py
+++ b/scripts/31-mkwww-publishers.py
@@ -6,7 +6,7 @@
 from xdfile.utils import progress, open_output, get_args, args_parser, COLUMN_SEPARATOR
 from xdfile.utils import br_with_n
 from xdfile import html, utils, catalog, pubyear
-from xdfile import metadatabase as metadb
+from xdfile import metadatabase as metadb, metasql
 from xdfile.html import GridCalendar, mktag, year_widget
 from xdfile.xdfile import num_cells
 import xdfile
@@ -25,10 +25,10 @@ def __init__(self, pubid):
         self.puzzles_meta = []
 
     def add(self, puzrow):
-        self.copyrights[puzrow.Copyright.strip()] += 1
-        self.editors[puzrow.Editor.strip()] += 1
-        self.formats[puzrow.Size] += 1
-        datestr = puzrow.Date
+        self.copyrights[puzrow['Copyright'].strip()] += 1
+        self.editors[puzrow['Editor'].strip()] += 1
+        self.formats[puzrow['Size']] += 1
+        datestr = puzrow['Date']
         if datestr:
             if not self.mindate:
                 self.mindate = datestr
@@ -80,13 +80,13 @@ def main():
     all_pubs = {}  # [(pubid,year)] -> PublicationStats
     pubyear_rows = {}
     similar = metadb.xd_similar()
-    puzzles = metadb.xd_puzzles()
+    puzzles = metasql.select('SELECT * FROM puzzles;')
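+    # NOTE: dict-style access (puzrow['xdid'] etc.) assumes the sqlite
+    # connection in metasql sets row_factory = sqlite3.Row (not shown in
+    # this diff); plain tuples from fetchall() would not support it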
 
     outf.write_html('pub/index.html', pubyear.pubyear_html(), title='The xd crossword puzzle corpus')
 
     utils.info("collating puzzles")
-    for puzrow in puzzles.values():
-        pubid = utils.parse_pubid(puzrow.xdid)
-        year = xdfile.year_from_date(puzrow.Date)
+    for puzrow in puzzles:
+        pubid = utils.parse_pubid(puzrow['xdid'])
+        year = xdfile.year_from_date(puzrow['Date'])
         k = (pubid, year or 9999)
         if k not in all_pubs:
             all_pubs[k] = PublicationStats(pubid)
@@ -101,20 +101,18 @@ def main():
         c_grids = {}
         pubid, year = pair
        progress(pubid)
-        reused_clues = 0
         reused_answers = 0
         total_clues = 0
         total_similar = []
 
         rows = []
-        # Assign class based on xdid and similars
         def get_cell_classes(r):
             """ Return cell classes based on parameters """
             # TODO: Implement check that authors are the same
             classes = []
-            rsim = similar.get(r.xdid)
+            rsim = similar.get(r['xdid'])
             if rsim and float(rsim.similar_grid_pct) > 0:
                 matches = [x.split('=') for x in rsim.matches.split()]
                 # Get max of matches for class definition
@@ -129,10 +127,10 @@ def get_cell_classes(r):
             if max_pct >= 100:
                 classes.append('exact')
             # Highlight only grids sized > 400 cells
-            if num_cells(r.Size) >= 400:
+            if num_cells(r['Size']) >= 400:
                 classes.append('biggrid')
             # Check for pub similarity
-            pubid, y, m, d = utils.split_xdid(r.xdid)
+            pubid, y, m, d = utils.split_xdid(r['xdid'])
             if pubid:
                 ymd = '%s%s%s' % (y, m, d)
                 if pubid not in [ x[0] for x in matches ]:
@@ -148,14 +146,15 @@ def get_cell_classes(r):
         similar_text = ""
         reused_clue_pct = "n/a"
-        rsim = similar.get(r.xdid)
+        rsim = similar.get(r['xdid'])
         if rsim:
             similar_pct = float(rsim.similar_grid_pct)
             if similar_pct > 0:
                 matches = [x.split('=') for x in rsim.matches.split()]
                 for xdid, pct in matches:
-                    if xdid in puzzles.keys():
-                        similar_text += '(%s%%) %s [%s]<br/>' % (pct, puzzles[xdid].Author, xdid)
+                    if any(d['xdid'] == xdid for d in puzzles):
+                        author = [ d['Author'] for d in puzzles if d['xdid'] == xdid ]
+                        similar_text += '(%s%%) %s [%s]<br/>' % (pct, author[0], xdid)
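+                        # building a lookup once, e.g. {d['xdid']: d for d in puzzles},
+                        # would avoid rescanning the whole list for every match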
                 total_similar.append(similar_pct)
         else:
             similar_text = "0"
@@ -174,31 +173,32 @@ def get_cell_classes(r):
         row_dict = {}  # Map row and style
         if similar_text and similar_text != "0":
             # http://stackoverflow.com/questions/1418838/html-making-a-link-lead-to-the-anchor-centered-in-the-middle-of-the-page
-            pubidtext = '' % r.xdid
+            pubidtext = '' % r['xdid']
+            pubidtext += ''
-            pubidtext += html.mkhref(r.xdid, '/pub/' + r.xdid)
-            c_grids[r.Date] = {
-                'link' : '/pub/%s%s/index.html#' % (pubid, year) + r.xdid,
-                'class': get_cell_classes(r),
+            pubidtext += html.mkhref(r['xdid'], '/pub/' + r['xdid'])
+            c_grids[r['Date']] = {
+                'link' : '/pub/%s%s/index.html#' % (pubid, year) + r['xdid'],
+                'class': get_cell_classes(r),
                 'title': br_with_n(similar_text),
             }
             row_dict['tag_params'] = {
-                'onclick': 'location.href=\'/pub/%s\'' % r.xdid,
+                'onclick': 'location.href=\'/pub/%s\'' % r['xdid'],
                 'class': 'puzzlehl'
             }
         else:
-            pubidtext = r.xdid
+            pubidtext = r['xdid']
             row_dict['class'] = 'puzzle'
-
-        row = [
+            continue  # don't display unique puzzles in table; refer to download/google sheet
+
+        row = [
             pubidtext,
-            r.Date,
-            r.Size,
-            r.Title,
-            r.Author,
-            r.Editor,
-            r.Copyright,
-            r.A1_D1,
+            r['Date'],
+            r['Size'],
+            r['Title'],
+            r['Author'],
+            r['Editor'],
+            r['Copyright'],
+            r['A1_D1'],
             reused_clue_pct,
             similar_text
         ]
@@ -206,17 +206,14 @@ def get_cell_classes(r):
         outf.write_row('pub/%s%s.tsv' % (pubid, year), " ".join(pubyear_header), row)
         row_dict['row'] = row
         rows.append(row_dict)
-
+
         pub_grids[pubid][year] = c_grids
 
-        # Generate calendar
+        # Generate calendar
         onepubyear_html = GridCalendar(c_grids).formatyear(year, 6) + "<br>"
-        # Generate html table sorted by 2nd element of row (date)
-        onepubyear_html += html.html_table(sorted(rows , key=lambda x: x['row'][1]), pubyear_header, "puzzle", "puzzles")
+        onepubyear_html += html.html_table(sorted(rows, key=lambda x: x['row'][1]), pubyear_header, "puzzle", "puzzles")
         outf.write_html("pub/%s%s/index.html" % (pubid, year), onepubyear_html, title="%s %s" % (pubid, year))
-
-
         cluepct = ""
         wordpct = ""
         if total_clues:
diff --git a/scripts/33-mkwww-words.py b/scripts/33-mkwww-words.py
index a5268fe..6eca1e2 100755
--- a/scripts/33-mkwww-words.py
+++ b/scripts/33-mkwww-words.py
@@ -3,7 +3,7 @@
 from queries.similarity import find_similar_to, find_clue_variants, load_clues, load_answers
 from xdfile.utils import get_args, open_output, find_files, log, debug, get_log, COLUMN_SEPARATOR, EOL, parse_tsv, progress, parse_pathname
 from xdfile.html import th, td, mkhref, html_select_options
-from xdfile import corpus, clues, pubyear, metadatabase, utils
+from xdfile import corpus, clues, pubyear, metadatabase, utils, metasql
 from collections import Counter
 import random
 
@@ -13,8 +13,8 @@ def xd_metadata_row(xdid):
     if not g_puzzles_md:
-        for r in metadatabase.xd_puzzles().values():
-            g_puzzles_md[r.xdid] = r
+        for r in metasql.select('SELECT * FROM puzzles;'):
+            g_puzzles_md[r['xdid']] = r
     return g_puzzles_md[xdid]
 
@@ -31,18 +31,17 @@ def mkwww_wordpage(answer):
     for ca in sorted(uses, reverse=True, key=lambda ca: ca.date):
         try:
             md = xd_metadata_row(ca.xdid())
-            h += td(md.xdid, ca.clue, md.Author, md.Copyright)
+            h += td(md['xdid'], ca.clue, md['Author'], md['Copyright'])
         except Exception as e:
-            h += td(ca.xdid(), ca.clue, str(e))
+            h += td(ca.xdid, ca.clue, str(e))
             if utils.get_args().debug:
                 raise
 
     h += ''
-#    h += ''
-#    h += 'Mutations: '
+#    h += ''
-
-    return h
+    return h
 
 
 def main():
@@ -60,7 +59,6 @@ def main():
     h = '  • %d different words  • ' % len(all_uses)
     h += '    Most used words    '
-    h += ''
 
     h += th("word", "# uses", "clues used with this answer")
 
@@ -78,5 +76,4 @@
     outf.write_html('pub/word/%s/index.html' % word.upper(), mkwww_wordpage(word), title=word)
 
     outf.write_html('pub/word/index.html', h, title="Words")
-
 main()
diff --git a/scripts/37-pubyear-svg.py b/scripts/37-pubyear-svg.py
new file mode 100755
index 0000000..7937524
--- /dev/null
+++ b/scripts/37-pubyear-svg.py
@@ -0,0 +1,286 @@
+#!/usr/bin/python3
+
+import re
+import operator
+
+from datetime import date
+from xdfile import utils
+from xdfile import metasql as metadb
+from xdfile import html
+from xdfile.utils import space_with_nbsp
+import xdfile
+from collections import defaultdict, OrderedDict
+
+
+svg_w = 32
+svg_h = 35
+decade_scale = 1.3
+
+pys = '''
+
+
+
+{body}
+
+'''
+
+legend = '''
+Broken out by day-of-week (Monday at top, Sunday at bottom). Thicker lines mean larger puzzles.
+
+
+
+
+
+
+
+      50%+ grid match of an earlier puzzle, same author (reprint/resubmission)
+      30-50% grid match of an earlier puzzle (likely theme copy)
+      50%+ grid match of an earlier puzzle, different author (suspicious)
+
+      crosswords available for public download
+      crosswords currently not publicly available
+
+'''
+
+
+def rect(x, y, w, h, *classes):
+    return '<rect x="{x}" y="{y}" width="{w}" height="{h}" class="{classes}"/>\n'.format(x=int(x), y=int(y), w=int(w), h=int(h), classes=''.join(classes))
+
+
+def year_from(dt):
+    return int(dt.split('-')[0])
+
+
+def weekdays_between(dta, dtb):
+    return 0
+
+
+def pubyear_svg(rows, height=svg_h, width=svg_w, pubid='', year=''):  # , nsusp, ndup, npub, npriv):
+    bgclass = "notexists"
+#    if bgclass not in publications.tsv:
+#        bgclass = "exists"
+
+    rects = ''
+    """
+    pubid CHAR(6),        -- "nyt"
+    year CHAR(4),         -- "2006"
+    weekday CHAR(3),      -- "Mon"
+    Size TEXT,            -- most common entry
+    Editor TEXT,          -- most common entry
+    Copyright TEXT,       -- most common, after removing Date/Author
+    NumExisting INTEGER,  -- known or assumed to be in existence (0 means unknown)
+    NumXd INTEGER,        -- total number in xd
+    NumPublic INTEGER,    -- available for public download
+    -- duplicate grids, same author
+    NumReprints INTEGER,  -- 100% grid match
+    NumTouchups INTEGER,  -- 75-99% grid match
+    NumRedone INTEGER,    -- 30-75% grid match
+    -- duplicate grids, different author
+    NumSuspicious INTEGER,   -- >50% similar grid
+    NumThemeCopies INTEGER   -- 30-50% similar grid
+    """
+    row = rows[0]
+    svgtitle = '{} {}\n'.format(row['pubid'], row['year'])
+    svgtitle += 'Copyright: {}\n'.format(row['Copyright']) if row['Copyright'] else ''
+    svgtitle += 'Editor: {}'.format(row['Editor']) if row['Editor'] else ''
+
+    for i, wd in enumerate(utils.WEEKDAYS):  # range(0, 7):
+        row = rows[i]
+        y = i*2 + 2
+        num_existing = 52 if 's' not in year else 520  # (eventually number of this weekday in that year, *10 for decades)
+
+        num_xd = row["NumXd"]
+
+        # dup_length is length of dup/orange line
+        num_dup = row['NumReprints'] + row['NumTouchups'] + row['NumRedone']
+
+        # susp_length is length of suspicious/red line
+        num_susp = row['NumSuspicious']
+        num_theme = row['NumThemeCopies']
+        # TODO: base color on suspicious vs theme (darker when only suspicious)
+
+        num_pub = row['NumPublic']
+
+        num_priv = num_xd - num_pub
+
+        pixel_prexd = 0
+        pixel_postxd = 0
+        if num_xd < num_existing:
+            # for now; eventually should use earliest/latest date and puzzle to determine which side has gap
+            # npre = weekdays_between(date(year_from(firstxd.Date), 1, 1), firstxd.Date, i)
+            # npost = weekdays_between(lastxd.Date, date(year_from(lastxd.Date), 12, 31), i)
+            pixel_prexd = 1
+            pixel_postxd = 1
+
+        if not num_xd or not num_existing:
+            continue
+
+        pixel_total = width - pixel_prexd - pixel_postxd
+
+        if num_xd <= num_existing:
+            pixel_xd = pixel_total * num_xd / num_existing
+        else:
+            pixel_xd = pixel_total
+
+        # then convert num_* to pixel_*, num_existing to pixel_total
+        pixel_susp = num_susp*pixel_xd/num_xd
+        pixel_theme = num_theme*pixel_xd/num_xd
+        pixel_dup = num_dup*pixel_xd/num_xd
+        pixel_pub = num_pub*pixel_xd/num_xd
+        pixel_priv = num_priv*pixel_xd/num_xd
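+        # e.g. with num_existing=52 and num_xd=26, pixel_xd spans half of
+        # pixel_total, and each num_* count is scaled by the same ratio so
+        # the colored segments always sum to the bar drawn for that weekday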
+        if pixel_theme > 0 and pixel_theme < 1:
+            pixel_theme = 1
+        if pixel_susp > 0 and pixel_susp < 1:
+            pixel_susp = 1
+        if pixel_dup > 0 and pixel_dup < 1:
+            pixel_dup = 1
+
+        m = re.match(r'(\d+?)x(\d+?).*', row['Size'])
+        if m:
+            sz = int(m.group(1)) * int(m.group(2))
+            if sz > 17*17:
+                h = 4
+            else:
+                h = 1.5
+        else:
+            h = 1
+
+        x = 0
+        w = 6
+        rects += ''''''.format(utils.WEEKDAYS[i], y=int(y))
+
+        w = pixel_prexd
+#        rects += rect(x, y, w, h, 'prexd')
+        x += w
+
+        w = pixel_susp
+        rects += rect(x, y, w, h, 'suspxd')
+        x += w
+
+        w = pixel_theme
+        rects += rect(x, y, w, h, 'themexd')
+        x += w
+
+        w = pixel_dup
+        rects += rect(x, y, w, h, 'dupxd')
+        x += w
+
+        if x <= pixel_total:
+            w = min(pixel_total - x, max(0, pixel_priv))
+            rects += rect(x, y, w, h, 'privxd')
+            x += w
+
+        if x <= pixel_total:
+            w = min(pixel_total - x, max(0, pixel_pub))
+            rects += rect(x, y, w, h, 'pubxd')
+            x += w
+
+#        w = pixel_postxd
+#        rects += rect(x, y, w, h, 'postxd')
+    rects += ''
+    href = "/pub/%s%s" % (pubid, year) if 's' not in year else "/pub/%s/index.html#%s" % (pubid, year[:-1])
+    ret = html.mkhref(pys.format(w=width, h=height, classes=bgclass, body=rects), href, svgtitle)
+    return ret
+
+
+def main():
+    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
+    p.add_argument('-a', '--all', default=False, help='analyze all puzzles, even those already in similar.tsv')
+    args = utils.get_args(parser=p)
+    outf = utils.open_output()
+
+    pubyears = defaultdict(list)
+    pubyears_idx = defaultdict(list)
+    # years_idx = []
+    for r in metadb.select("SELECT * FROM stats"):
+        y = r['year'] or '9999'
+        pubyear = r['pubid'] + y
+        pubyears[pubyear].append(r)
+        if y not in pubyears_idx[r['pubid']]:
+            pubyears_idx[r['pubid']].append(y)
+        # if r['year'] not in years_idx:
+        #     years_idx.append(r['year'])
+
+    # Making collapsed decades depends on args
+    skip_decades = None
+    skip_decades = skip_decades if skip_decades else { 'start': 1910, 'end': 1980 }
+    allyears = []
+    for i in range(skip_decades['start']//10, skip_decades['end']//10 + 1):
+        allyears.append("%s0s" % i)
+    allyears.extend([ str(y) for y in range(skip_decades['end'] + 10, date.today().year + 1) ])
+
+    html_out = []
+    html_out.append(legend)
+    html_out.append('')
+
+    # Table header with years \ decades
+    year_header = []
+    year_header.append('')
+    for year in sorted(allyears):
+        if year[-1] == 's':
+            lead = ''
+            yclass = 'decade'
+        elif year[3] == '0':
+            lead = year[:2]
+            yclass = 'zero-year'
+        else:
+            lead = ' '
+            yclass = 'ord-year'
+        year_header.append(''.format(yclass, lead, year[2:]))
+    year_header.append('')
+    html_out.extend(year_header)
+
+    sorted_idx = OrderedDict(sorted(pubyears_idx.items(), key=lambda r: min(r[1])))
+    for pub in sorted_idx:
+        # Process each pub in index
+        pubobj = metadb.xd_publications().get(pub)
+        pubname = pubobj.PublicationName if pubobj else ''
+        html_out.append(''.format(html.mkhref((pubname or pub), pub)))
+        for year in sorted(allyears):
+            py = pub + year
+            py_svg = None
+            html_out.append('     {}    {}    {}')
+            if 's' not in year:
+                # Process for single year
+                if py in pubyears:
+                    py_svg = pubyear_svg(pubyears[py], pubid=pub, year=year)
+            else:
+                # Process for decade
+                decade = []
+                row_id = ['NumXd', 'NumReprints', 'NumTouchups', 'NumRedone', 'NumSuspicious', 'NumThemeCopies', 'NumPublic']
+                for wdi, wd in enumerate(utils.WEEKDAYS):
+                    wd_dict = {}
+                    wd_dict['weekday'] = wd
+                    wd_dict['pubid'] = pub
+                    wd_dict['year'] = year
+                    wd_dict['Copyright'] = ''
+                    wd_dict['Editor'] = ''
+                    wd_dict['Size'] = ''
+                    for dec_year in [year[:3]+str(y) for y in range(0, 10)]:
+                        for rid in row_id:
+                            if pubyears[pub+dec_year]:
+                                if rid in wd_dict:
+                                    wd_dict[rid] += pubyears[pub+dec_year][wdi][rid]
+                                else:
+                                    wd_dict[rid] = pubyears[pub+dec_year][wdi][rid]
+                    # Emulate 7 rows per decade
+                    if row_id[0] in wd_dict:
+                        decade.append(wd_dict)
+                py_svg = pubyear_svg(decade, width=svg_w*decade_scale, year=year, pubid=pub) if decade else None
+
+            if py_svg:
+                html_out.append(py_svg)
+            else:
+                width = svg_w if 's' not in year else svg_w*decade_scale
+                html_out.append(pys.format(w=width, h=svg_h, title='', classes='notexists', body=''))
+
+        html_out.append('{}    ')
+
+    html_out.extend(year_header)
+    outf.write_html('pub/index.html', "".join(html_out), "Published crosswords by year")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/40-deploy.sh b/scripts/40-deploy.sh
index 901ccbe..159bdb4 100755
--- a/scripts/40-deploy.sh
+++ b/scripts/40-deploy.sh
@@ -17,4 +17,3 @@ aws s3 sync --region $REGION $WWW ${S3WWW}/ --acl public-read
 ALLLOGS=$WWW/log/$TODAY-logs.txt
 scripts/49-cat-logs.py -o $ALLLOGS $PUB $TMP
 aws s3 cp --region $REGION $ALLLOGS ${S3WWW}/logs/ --acl public-read
-
diff --git a/scripts/41-git-commit.sh b/scripts/41-git-commit.sh
index e983880..5f152d2 100755
--- a/scripts/41-git-commit.sh
+++ b/scripts/41-git-commit.sh
@@ -34,6 +34,7 @@ else
     echo "SUMMARY: Commiting into master"
     git add .
     git commit -m "incoming for $TODAY"
+    ssh-agent bash -c "ssh-add ${SSHHOME}/.ssh/gxd_rsa; git push"
 fi
diff --git a/scripts/49-cat-logs.py b/scripts/49-cat-logs.py
index 9dc6b8b..4ebe438 100755
--- a/scripts/49-cat-logs.py
+++ b/scripts/49-cat-logs.py
@@ -6,7 +6,8 @@
 # concatenates .log files (even those in subdirs or .zip) and combines into a single combined.log
 
 from xdfile.utils import find_files_with_time, open_output, get_args
-from boto.s3.connection import S3Connection
+import boto3
+# from boto.s3.connection import S3Connection
 import os
 
@@ -14,15 +15,15 @@ def main():
     args = get_args('aggregates all .log files')
     outf = open_output()
 
-    print(os.environ['AWS_ACCESS_KEY'], os.environ['AWS_SECRET_KEY'])
-    conn = S3Connection(aws_access_key_id=os.environ['AWS_ACCESS_KEY'], aws_secret_access_key=os.environ['AWS_SECRET_KEY'])
-    print(conn)
-    s3path = "s3://" + os.environ['BUCKET'] + "/logs/"
-    bucket = conn.get_bucket(s3path)
-    print(bucket, s3path)
-    for key in sorted(bucket.list(), key=lambda x: x.last_modified):
+    s3 = boto3.resource('s3')
+    s3path = "logs/"
+    # bucket = conn.get_bucket(s3path)
+    bucket = s3.Bucket(os.environ['BUCKET'])
+
+    for obj in sorted(bucket.objects.all(), key=lambda x: x.last_modified):  # last_modified
-        print("Name: %s LastModified:%s" % (key.name.encode('utf-8'), key.last_modified))
+        if s3path in obj.key:
+            print("Name: %s LastModified:%s" % (obj.key.encode('utf-8'), obj.last_modified))
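+            # NOTE (sketch): bucket.objects.filter(Prefix=s3path) would let S3
+            # do the prefix filtering server-side instead of listing the whole
+            # bucket; .filter(Prefix=...) is part of the boto3 resource API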
 
     for fn, contents, dt in sorted(find_files_with_time(*args.inputs, ext=".log"), key=lambda x: x[2]):  # earliest first
         outf.write_file(fn, contents.decode("utf-8"))
diff --git a/scripts/html/style.css b/scripts/html/style.css
index 5c0e30e..d2888d4 100644
--- a/scripts/html/style.css
+++ b/scripts/html/style.css
@@ -1,5 +1,57 @@
-.year_widget rect {
-    fill:green;
+#pubyearmap td {
+    border-right: 1px solid grey;
+    border-bottom: 1px solid grey;
+    text-align: center;
+}
+
+#pubyearmap td.header {
+    font-weight: bold;
+    font-size: 12px;
+    width: 60px;
+}
+
+#pubyearmap td.year_widget {
+    background: #a8a8a8;
+}
+
+#pubyearmap td.decade {
+    background: lightgrey;
+    color: black;
+/*    border-color: white; */
+    font-weight: bold;
+    font-size: 12px;
+}
+
+#pubyearmap td.zero-year {
+    background: lightgrey;
+    color: black;
+/*    border-color: white; */
+    font-weight: bold;
+    font-size: 12px;
+}
+
+#pubyearmap td.ord-year {
+    background: #ccf;
+    color: black;
+/*    border-color: white; */
+    font-size: 12px;
+}
+
+td:hover {
+    background: darkgray;
+}
+
+th {
+    font-weight: bold;
+    font-size: 12px;
+}
+
+.year_widget {
+    fill: #a8a8a8;
+    overflow: visible;
 }
 
 .year_widget rect.red {
@@ -14,10 +66,40 @@
     fill:white;
 }
 
-.year_widget rect.similar10 {
+.similar10 {
     fill:lightsalmon;
 }
 
+/* Prexd */
+.notexists {
+    fill: white;
+}
+
+.suspxd {
+    fill: darkred;
+    background-color: darkred;
+}
+
+.dupxd {
+    fill: yellow;
+    background-color: yellow;
+}
+
+.themexd {
+    fill: orange;
+    background-color: orange;
+}
+
+.privxd {
+    fill: blue;
+    background-color: blue;
+}
+
+.pubxd {
+    fill: green;
+    background-color: green;
+}
+
 .year_widget text {
     fill: black;
     font: 10px sans-serif;
diff --git a/scripts/meta.sql b/scripts/meta.sql
index 5020264..4c139fd 100644
--- a/scripts/meta.sql
+++ b/scripts/meta.sql
@@ -1,11 +1,75 @@
-CREATE TABLE "receipts" (
-    "CaptureTime" TEXT,
-    "ReceivedTime" TEXT,
-    "ExternalSource" TEXT,
-    "InternalSource" TEXT,
-    "SourceFilename" TEXT,
-    "xdid" TEXT
+-- xd
+
+CREATE TABLE receipts (
+    CaptureTime TEXT,
+    ReceivedTime TEXT,
+    ExternalSource TEXT,
+    InternalSource TEXT,
+    SourceFilename TEXT,
+    xdid CHAR(16),
+    PRIMARY KEY (ExternalSource, SourceFilename)
+);
+
+CREATE INDEX XDID on receipts (xdid ASC);
+
+
+CREATE TABLE similar_grids (
+    xdid CHAR(16),
+    xdidMatch CHAR(16),
+    GridMatchPct INTEGER
+);
+
+
+CREATE TABLE similar_clues (
+    xdid CHAR(16),
+    reused_clues INTEGER,
+    reused_answers INTEGER,
+    total_clues INTEGER
 );
 
-CREATE INDEX "XDID" on receipts (xdid ASC);
+
+CREATE TABLE publications (
+    PublicationAbbr CHAR(8),
+    PublisherAbbr CHAR(8),
+    PublicationName TEXT,
+    PublisherName TEXT,
+    FirstIssueDate CHAR(10),
+    LastIssueDate CHAR(10),
+    NumberIssued INTEGER,
+    Contact TEXT,
+    Sources TEXT
+);
+
+
+CREATE TABLE puzzles (
+    xdid CHAR(16),   -- "eltana-001"
+    Date CHAR(10),   -- "2016-07-18"
+    Size CHAR(8),    -- "15x15RS" (Rebus/Special)
+    Title TEXT,
+    Author TEXT,
+    Editor TEXT,
+    Copyright TEXT,
+    A1_D1 TEXT
+);
+
+
+-- grouped by pub-year-weekday
+CREATE TABLE stats (
+    pubid CHAR(6),    -- "nyt"
+    year CHAR(4),     -- "2006"
+    weekday CHAR(3),  -- "Mon"
+    Size TEXT,        -- most common entry
+    Editor TEXT,      -- most common entry
+    Copyright TEXT,   -- most common, after removing Date/Author
+    NumExisting INTEGER,  -- known or assumed to be in existence (0 means unknown)
+    NumXd INTEGER,        -- total number in xd
+    NumPublic INTEGER,    -- available for public download
+    -- duplicate grids, same author
+    NumReprints INTEGER,  -- 100% grid match
+    NumTouchups INTEGER,  -- 75-99% grid match
+    NumRedone INTEGER,    -- 30-75% grid match
+    -- duplicate grids, different author
+    NumSuspicious INTEGER,   -- >50% similar grid
+    NumThemeCopies INTEGER   -- 30-50% similar grid
+);
diff --git a/scripts/tsv2sqlite.py b/scripts/tsv2sqlite.py
index 86b9deb..6ca814f 100755
--- a/scripts/tsv2sqlite.py
+++ b/scripts/tsv2sqlite.py
@@ -11,17 +11,46 @@
 from xdfile import metadatabase as metadb
 
+# Map tsvtype to sql table
+sqlmap = {
+    'Receipt' : 'receipts',
+    'Publication' : 'publications',
+}
+
 
 def main():
     p = args_parser('convert .tsv to sqlite')
+    p.add_argument('--tsvtype', default=None, help='Tsv file type to import')
     args = get_args(parser=p)
 
-    sqlconn = sqlite3.connect(args.output)
-    cur = sqlconn.cursor()
+    if args.tsvtype is not None:
+        # Process only if tsvtype supplied
+        sqlconn = sqlite3.connect(args.output)
+        cur = sqlconn.cursor()
+        rows = [list(r) for r in xdfile.utils.parse_tsv_rows(args.inputs[0], args.tsvtype)]
+
+        if args.tsvtype == 'Similar':
+            # Fill up similar_clues first
+            sclues = [[x[0], x[2], x[3], x[4]] for x in rows]
+            INS_TMPL = ",".join('?' * len(sclues[0]))
+            cur.executemany('INSERT INTO %s VALUES (%s)' % ('similar_clues', INS_TMPL), sclues)
+            # Fill up similar_grids
+            sgrids = []
+            for r in rows:
+                if '=' in r[5]:
+                    for pos in r[5].split(' '):
+                        (xdidm, pctm) = pos.split('=')
+                        sgrids.append([r[0], xdidm, int(pctm)])
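+            # e.g. a matches field "abc1995-01-01=100 xyz2001-09-09=40"
+            # yields two similar_grids rows: [xdid, 'abc1995-01-01', 100]
+            # and [xdid, 'xyz2001-09-09', 40]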
+
+            INS_TMPL = ",".join('?' * len(sgrids[0]))
+            cur.executemany('INSERT INTO %s VALUES (%s)' % ('similar_grids', INS_TMPL), sgrids)
+        else:
+            rows = [list(r) for r in xdfile.utils.parse_tsv_rows(args.inputs[0], args.tsvtype)]
 
-    rows = [list(r) for r in xdfile.utils.parse_tsv_rows(args.inputs[0], "Receipt")]
-    info("Rows to be inserted to sql: %s" % len(rows))
-    cur.executemany('INSERT INTO receipts VALUES (?,?,?,?,?,?)', rows)
-    sqlconn.commit()
+            info("Rows to be inserted to sql table [ %s ]: %s" % (sqlmap[args.tsvtype], len(rows)))
+            INS_TMPL = ",".join('?' * len(rows[0]))
+            cur.executemany('INSERT OR IGNORE INTO %s VALUES (%s)' % (sqlmap[args.tsvtype], INS_TMPL), rows)
+        sqlconn.commit()
 
 if __name__ == "__main__":
     main()
diff --git a/xdfile/cloud.py b/xdfile/cloud.py
index 77cd1b4..117e20c 100644
--- a/xdfile/cloud.py
+++ b/xdfile/cloud.py
@@ -1,6 +1,6 @@
 import boto3
 
-from xdfile.utils import log, info
+from xdfile.utils import log, info, debug, error
 
 def xd_send_email(destaddr, fromaddr='admin@xd.saul.pw', subject='', body=''):
     client = boto3.client('ses')
diff --git a/xdfile/metasql.py b/xdfile/metasql.py
index 60c71bc..6b8c049 100644
--- a/xdfile/metasql.py
+++ b/xdfile/metasql.py
@@ -13,6 +13,7 @@
 METADB = "meta.db"  # SQLLite database
 METADB_RECEIPTS = "receipts"  # Receipts table
+METADB_PUZZLES = 'puzzles'  # Puzzles table
 
 RECEIPTS_TSV = "gxd/receipts.tsv"
 SIMILAR_TSV = "gxd/similar.tsv"
@@ -101,7 +102,12 @@ def xd_publications():
 
 @utils.memoize
 def xd_puzzles():
-    return utils.parse_tsv(PUZZLES_TSV, "Puzzle")
+    cursor.execute('SELECT * FROM %s' % (METADB_PUZZLES))
+    for c in cursor.fetchall():
+        print("C: %s" % c)
+
+    #return utils.parse_tsv(PUZZLES_TSV, "Puzzle")
 
 @utils.memoize
 def xd_similar():
@@ -135,12 +141,25 @@ def append_row(tsvpath, headerstr, row, to_sql=False):
         fp.write(COLSEP.join([str(x) for x in row]) + EOL)
         fp.close()
     else:
+        # tsvpath = SQL table name
         cur = sqlconn.cursor()
-        INS_TMPL = ",".join('?' * len(COLSEP.split(headerstr)))
-        cur.execute("INSERT INTO %s VALUES (%s)" % (METADB_RECEIPTS, INS_TMPL), ([str(x) for x in row]))
+        INS_TMPL = ",".join('?' * len(row))
+        cur.execute("INSERT INTO %s VALUES (%s)" % (tsvpath, INS_TMPL), ([str(x) for x in row]))
         sqlconn.commit()
 
 
+def select(query, *args):
+    # Execute a SQL statement w/o commit and return all rows
+    cursor.execute(query, *args)
+    return cursor.fetchall()
+
+
+def execute(query, *args):
+    # Execute a SQL statement with commit
+    cursor.execute(query, *args)
+    sqlconn.commit()
+    # return cursor.fetchall()
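+# e.g. select("SELECT * FROM %s WHERE xdid = ?" % METADB_PUZZLES, ("nyt1942-02-15",))
+# passes the parameter tuple straight through to cursor.execute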
    %s" % (msy, lsy) + def get_pubheader_classes(*years): """ Assign classes to years header @@ -34,9 +32,34 @@ def get_pubheader_classes(*years): if " " in str(y): classes.append("ord-year") else: - classes.append("zero-year") + classes.append("zero-year") return classes - + + +def year_widget(dow_dict, total, fill_class=None): + # Generate SVG based widget for day of week dispersion for year + fill_class = fill_class or 'white' + b = [] + b.append('') + b.append('' % fill_class) + for i, v in enumerate(utils.WEEKDAYS): + _class = dow_dict[v]['class'] if 'class' in dow_dict[v].keys() else '' + _length = str(dow_dict[v]['count']) if 'count' in dow_dict[v].keys() else '0' + _length = _length if int(_length) < 26 else '30' # for all 52/2 have full filled row + b.append('') + b.append('') + return(' '.join(b)) + +def decade_widget(total, fill_class=None): + # Generate SVG based widget for decade showing total + fill_class = fill_class or 'green' + b = [] + b.append('') + b.append('' % fill_class) + b.append('' + str(total) + '') + b.append('') + return(' '.join(b)) + g_all_pubyears = None def pubyear_html(pubyears=[], skip_decades=None): @@ -47,9 +70,9 @@ def pubyear_html(pubyears=[], skip_decades=None): if not g_all_pubyears: g_all_pubyears = utils.parse_tsv_data(open("pub/pubyears.tsv").read(), "pubyear") - + # Read similars to make background of widgets - similar_d = defaultdict(dict) + similar_d = defaultdict(dict) for xdid, v in utils.parse_tsv('gxd/similar.tsv', "similar").items(): xd_split = utils.split_xdid(xdid) if xd_split: @@ -61,7 +84,7 @@ def pubyear_html(pubyears=[], skip_decades=None): b = [] # Body - # Making collapsed decaded depends on args + # Making collapsed decades depends on args skip_decades = skip_decades if skip_decades else { 'start': 1910, 'end': 1970 } allyears = [] for i in range(skip_decades['start']//10, skip_decades['end']//10 + 1): @@ -108,25 +131,25 @@ def pubyear_html(pubyears=[], skip_decades=None): 'hint': hint, 'total': int(total), } - + # main table b.append('') yhdr = [ ' ' ] + [ split_year(y) for y in allyears ] yhdr.append("all") b.append(td_with_class(*yhdr, classes=get_pubheader_classes(*yhdr), rowclass="pubyearhead",tag="th")) - b.append(tr_empty()) - - # Process each pubid sorted by earliest year + b.append(tr_empty()) + + # Process each pubid sorted by earliest year for pubid in sorted(pubs, key=lambda x:min(pubs[x])): pub = metadb.xd_publications().get(pubid) pubname = pub.PublicationName if pub else '' - # Pub id to first column + # Pub id to first column b.append(mktag('tr')) b.append(mktag('td','pub')) b.append(mkcell(space_with_nbsp(pubname or pubid), "/pub/" + pubid, )) b.append(mktag('/td')) - + # Process each year not collapsed into decade for yi in allyears: if yi in pubs[pubid] and pubs[pubid][yi]['total'] > 0: @@ -140,11 +163,11 @@ def pubyear_html(pubyears=[], skip_decades=None): b.append(mktag('td', 'block')) b.append(' ') b.append(mktag('/td')) - + b.append(mktag('td')) b.append(str(sum([ pubs[pubid][x]['total'] for x in pubs[pubid].keys() ]))) b.append(mktag('/td')) b.append(mktag('/tr')) - + b.append(mktag('/table')) return (" ".join(b)) diff --git a/xdfile/xdfile.py b/xdfile/xdfile.py index 98adba8..7a50801 100755 --- a/xdfile/xdfile.py +++ b/xdfile/xdfile.py @@ -77,6 +77,9 @@ def height(self): def size(self): return (self.width(), self.height()) + def sizestr(self): + return "%dx%d%s%s" % (self.width(), self.height(), self.get_header("Rebus") and "R" or "", self.get_header("Special") and "S" or "") + def xdid(self): num = 
+        return "%dx%d%s%s" % (self.width(), self.height(), self.get_header("Rebus") and "R" or "", self.get_header("Special") and "S" or "")
+
     def xdid(self):
         num = self.get_header("Number")
         if num:
@@ -491,7 +494,7 @@ def get_xd(xdid):
     try:
         xd = xdfile(corpus_contents()[xdid].decode("utf-8"), xdid)
     except Exception as e:
-        error("get_xd() %s" % str(e))
+        # error("get_xd() %s" % str(e))
         return None
     return xd