Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Az error verbosity #27

Merged
merged 4 commits into from
Aug 16, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions queries/remix.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
#!/usr/bin/env python3

from xdfile.utils import get_args, open_output, find_files, log, debug, get_log, COLUMN_SEPARATOR, EOL, parse_tsv, progress, parse_pathname
from xdfile.utils import get_args, open_output, find_files, log, debug, info, error, get_log, COLUMN_SEPARATOR, EOL
from xdfile.utils import parse_tsv, progress, parse_pathname
from xdfile import corpus, xdfile, BLOCK_CHAR


# for a given grid
# for all words,
# for all words,
# show how many distinct clues there are per publication

# for each pub that has clues for all words,
Expand Down Expand Up @@ -87,7 +88,7 @@ def mutate(xd, words, chance=1):
if random.random() < chance:
nmutations += 1
xd.grid[r] = splice(xd.grid[r], c, best_replacement)
log("-> %s/%s (%s)" % (new_hwd, new_vwd, "".join(br for h, v, br in mutations_this_square)))
info("-> %s/%s (%s)" % (new_hwd, new_vwd, "".join(br for h, v, br in mutations_this_square)))
return nmutations


Expand Down Expand Up @@ -179,7 +180,7 @@ def main():
while nmutated < 100:
nmutated += mutate(xd, pub_clues)
nmissing = reclue(xd, pub_clues)
log("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))
info("%s missing %d clues after %d mutations" % (outfn, nmissing, nmutated))

remixed.add(pubid)
outf.write_file(outfn, xd.to_unicode())
Expand All @@ -189,14 +190,14 @@ def main():
missing_tsv += COLUMN_SEPARATOR.join([ xd.xdid(), pubid, str(nmissing) ]) + EOL

except Exception as e:
log("remix error %s" % str(e))
error("remix error %s" % str(e))

if remixed:
log("%d remixed: %s" % (len(remixed), " ".join(remixed)))
info("%d remixed: %s" % (len(remixed), " ".join(remixed)))
try:
outf.write_file(parse_pathname(fn).base + ".xd", contents.encode("utf-8"))
except Exception as e:
log("couldn't write: " + str(e))
error("couldn't write: " + str(e))

outf.write_file("remix.log", get_log().encode("utf-8"))
outf.write_file("remix.tsv", missing_tsv)
Expand Down
8 changes: 6 additions & 2 deletions scripts/00-aws-bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ chmod 600 $SSHHOME/.ssh/gxd_rsa
cat src/aws/ssh_config >> $SSHHOME/.ssh/config
ssh-agent bash -c "ssh-add $SSHHOME/.ssh/gxd_rsa; git clone ${GXD_GIT}"

# Import all .tsv to sql
echo "Import all .tsv to sql"
scripts/05-sql-import-receipts.sh

echo "Run deploy script"
Expand All @@ -59,11 +59,15 @@ echo 'SUMMARY: End time '`date +'%Y-%m-%d %H:%M'`
egrep -i 'ERROR|WARNING|SUMMARY' ${LOGFILE} > ${SUMLOGFILE}
echo -e '\n' >> ${SUMLOGFILE}

echo "Getting summary"
scripts/48-stats.sh >> ${SUMLOGFILE}
echo -e '\n' >> ${SUMLOGFILE}

echo "SUMMARY: Full log file http://$BUCKET/logs/`basename ${LOGFILE}`"
echo "SUMMARY: Full log file http://$BUCKET/logs/`basename ${LOGFILE}`" >> ${SUMLOGFILE}

echo "Sending email"
scripts/send-email.py $ADMIN_EMAIL "execution logs for $TODAY" ${SUMLOGFILE}

echo "Copy logs to AWS"
aws s3 cp --region ${REGION} ${LOGFILE} s3://${BUCKET}/logs/ --acl public-read
aws s3 cp --region ${REGION} ${SUMLOGFILE} s3://${BUCKET}/logs/ --acl public-read
9 changes: 5 additions & 4 deletions scripts/09-collection2zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@
import zipfile

from xdfile.metadatabase import xd_sources_row, xd_sources_header
from xdfile.utils import find_files_with_time, get_log, get_args, filetime, args_parser, parse_pathname, log, iso8601, open_output, strip_toplevel
from xdfile.utils import find_files_with_time, get_log, get_args, filetime, args_parser, parse_pathname
from xdfile.utils import log, info, iso8601, open_output, strip_toplevel


def main():
p = args_parser('catalog source files and create source.tsv')
p.add_argument('-s', '--source', default=None, help='ExternalSource')
args = get_args(parser=p)

log("importing from %s" % args.source)
info("importing from %s" % args.source)

outf = open_output()

Expand All @@ -25,14 +26,14 @@ def main():
for input_source in args.inputs:
for fn, contents, dt in find_files_with_time(input_source):
if len(contents) == 0:
log("ignoring empty file")
info("ignoring empty file")
continue

outf.write_file(strip_toplevel(fn), contents, dt)

sources.append(xd_sources_row(fn, args.source or input_source, iso8601(dt)))

log("%s files cataloged" % len(sources))
info("%s files cataloged" % len(sources))

outbase = parse_pathname(args.output).base

Expand Down
4 changes: 2 additions & 2 deletions scripts/12-parse-email.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

from xdfile.utils import open_output, log, find_files, get_args, parse_pathname, generate_zip_files, iso8601, to_timet
from xdfile.utils import open_output, info, log, find_files, get_args, parse_pathname, generate_zip_files, iso8601, to_timet
from xdfile.metadatabase import xd_sources_header, xd_sources_row
from xdfile.cloud import xd_send_email

Expand Down Expand Up @@ -53,7 +53,7 @@ def main():
for puzfn, puzdata, puzdt in email_files:
# a basic sanity check of filesize
# accommodate small puzzles and .pdf
log("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))
info("%s: %s from %s" % (puzfn, iso8601(puzdt), upload_src))

summary("%s puzzles from %s" % (len(email_files), upload_src))

Expand Down
4 changes: 2 additions & 2 deletions scripts/19-reshelve.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ def main():
seqnum = utils.parse_seqnum(r.xdid or r.SourceFilename)
if seqnum:
newxdid = newpubid + seqnum
utils.log("changing xdid from '%s' to '%s'" % (r.xdid, newxdid))
utils.info("changing xdid from '%s' to '%s'" % (r.xdid, newxdid))
d["xdid"] = newxdid
else:
utils.log("no date or number in xdid, not reshelving")
utils.info("no date or number in xdid, not reshelving")

all_receipts += metadb.xd_receipts_row(**d)

Expand Down
9 changes: 4 additions & 5 deletions scripts/21-clean-metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Usage: $0 [-o <puzzles.tsv>] <input>
#
# Generates puzzles.tsv with cleaned metadata for each .xd in <input>.
# Generates puzzles.tsv with cleaned metadata for each .xd in <input>.
#

from xdfile import utils, metadatabase as metadb
Expand Down Expand Up @@ -96,7 +96,7 @@ def clean_headers(xd):
xd.set_header(hdr, None)
else:
if hdr.lower() not in xdfile.HEADER_ORDER:
utils.log("%s: '%s' header not known: '%s'" % (xd.filename, hdr, xd.headers[hdr]))
utils.warn("%s: '%s' header not known: '%s'" % (xd.filename, hdr, xd.headers[hdr]))

# clean Author and Editor headers
author = xd.get_header("Author") or ""
Expand All @@ -122,8 +122,7 @@ def clean_headers(xd):

if newtitle != title:
xd.set_header("Title" + CLEAN_SUFFIX, newtitle)

# create Date header
# create Date header
dt = xd.get_header("Date")

## try getting Date from filename
Expand All @@ -133,7 +132,7 @@ def clean_headers(xd):
if d:
dt = d.strftime("%Y-%m-%d")
except Exception as e:
utils.log(str(e))
utils.error(str(e))
if args.debug:
raise

Expand Down
14 changes: 6 additions & 8 deletions scripts/25-analyze-puzzle.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#

from queries.similarity import find_similar_to, find_clue_variants, load_clues, load_answers, grid_similarity
from xdfile.utils import get_args, open_output, find_files, log, debug, get_log, COLUMN_SEPARATOR, EOL, parse_tsv, progress, parse_pathname
from xdfile.utils import get_args, open_output, find_files, log, info, debug, get_log, COLUMN_SEPARATOR, EOL, parse_tsv, progress, parse_pathname
from xdfile import xdfile, corpus, ClueAnswer, BLOCK_CHAR
import time
from xdfile import utils, metadatabase
Expand All @@ -27,25 +27,24 @@ def main():
if mainxd.xdid() in prev_similar:
continue # skip reprocessing .xd that are already in similar.tsv

""" find similar grids (pct, xd) for the mainxd in the corpus.
""" find similar grids (pct, xd) for the mainxd in the corpus.
Takes about 1 second per xd. sorted by pct.
"""
similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
similar_grids = sorted(find_similar_to(mainxd, corpus(), min_pct=0.20),
key=lambda x: x[0], reverse=True)

if similar_grids:
log("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
info("similar: " + " ".join(("%s=%s" % (xd2.xdid(), pct))
for pct, xd1, xd2 in similar_grids))

mainpubid = mainxd.publication_id()
maindate = mainxd.date()

# go over each clue/answer, find all other uses, other answers, other possibilities.
# go over each clue/answer, find all other uses, other answers, other possibilities.
# these are added directly to similar.tsv
nstaleclues = 0
nstaleanswers = 0
ntotalclues = 0

for pos, mainclue, mainanswer in mainxd.iterclues():
progress(mainanswer)

Expand Down Expand Up @@ -77,7 +76,7 @@ def main():
uses = []
for bc, nuses in bclues.items():
# then find all clues besides this one
clue_usages = [ ca for ca in load_clues().get(bc, [])
clue_usages = [ ca for ca in load_clues().get(bc, [])
if ca.answer == mainanswer and ca.date < maindate ]

if clue_usages:
Expand All @@ -89,7 +88,6 @@ def main():
else:
ca = sorted(clue_usages, key=lambda ca: ca.date or "z")[-1]
uses.append((ca, nuses))

# summary row to similar.tsv
row_header = 'xdid similar_grid_pct reused_clues reused_answers total_clues matches'
metadatabase.append_row('gxd/similar.tsv', row_header, [
Expand Down
6 changes: 2 additions & 4 deletions scripts/31-mkwww-publishers.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def main():
puzzles = metadb.xd_puzzles()
outf.write_html('pub/index.html', pubyear.pubyear_html(), title='The xd crossword puzzle corpus')

utils.log("collating puzzles")
utils.info("collating puzzles")
for puzrow in puzzles.values():
pubid = utils.parse_pubid(puzrow.xdid)
year = xdfile.year_from_date(puzrow.Date)
Expand All @@ -94,11 +94,9 @@ def main():
all_pubs[k].add(puzrow)

pubyear_header = [ 'xdid', 'Date', 'Size', 'Title', 'Author', 'Editor', 'Copyright', 'Grid_1A_1D', 'ReusedCluePct', 'SimilarGrids' ]
utils.log('generating index pages')

utils.info('generating index pages')
# dict to generate pub page with calendars
pub_grids = defaultdict(dict)

for pair, pub in sorted(list(all_pubs.items())):
c_grids = {}
pubid, year = pair
Expand Down
16 changes: 4 additions & 12 deletions scripts/35-mkwww-diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ def main():

similars = utils.parse_tsv('gxd/similar.tsv', 'Similar')
xdids_todo = args.inputs or [ xdid for xdid, matches in metadb.get_similar_grids().items() if matches ]

for mainxdid in xdids_todo:
progress(mainxdid)

Expand All @@ -102,7 +101,6 @@ def main():
xddates[mainxdid] = mainxd.date() # Dict to store XD dates for further sort
html_grids = {}
html_clues = {}

# Store in list to make further formatting as html table easier
html_grids[mainxdid] = grid_diff_html(xdfile.get_xd(mainxdid))

Expand All @@ -114,7 +112,6 @@ def main():
diff_h += mktag('span', tagclass='main', inner='&nbsp;~&nbsp;' + mainanswer.upper())
diff_l.append(diff_h)
html_clues[mainxdid] = diff_l

# Process for all matches
for xdid in matches:
xd = xdfile.get_xd(xdid)
Expand All @@ -124,11 +121,10 @@ def main():
xddates[xdid] = xd.date()
# output each grid
html_grids[xdid] = grid_diff_html(xd, compare_with=mainxd)

diff_l = []
# output comparison of each set of clues
for pos, clue, answer in xd.iterclues():
diff_h = mktag('div','fullgrid') + '%s.&nbsp;' %pos
diff_h = mktag('div','fullgrid') + '%s.&nbsp;' %pos
# Sometimes can return clue == None
sm = difflib.SequenceMatcher(lambda x: x == ' ', mainxd.get_clue(pos) or '', clue)
if sm.ratio() < 0.50:
Expand All @@ -141,30 +137,26 @@ def main():
diff_h += '<span class="match">%s</span>' % clue[b1:b2]
else:
diff_h += '<span class="diff">%s</span>' % clue[b1:b2]

diff_h += mktag('span', tagclass=(answer == mainxd.get_answer(pos)) and 'match' or 'diff', inner='&nbsp;~&nbsp;' + answer.upper())
diff_h += mktag('/div')
diff_l.append(diff_h)
html_clues[xdid] = diff_l

html_clues[xdid] = diff_l

# Wrap into table
diff_h = mktag('table') + mktag('tr')
# Sort by date
sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
sortedkeys = sorted(xddates.items(), key=operator.itemgetter(1))
for w, dt in sortedkeys:
# Wrap into table
diff_h += mktag('td') + html_grids[w] + mktag('/td')
diff_h += mktag('/tr')

for i, clue in enumerate(html_clues[sortedkeys[0][0]]):
diff_h += mktag('tr')
for w, dt in sortedkeys:
if i < len(html_clues[w]):
diff_h += mktag('td') + html_clues[w][i] + mktag('/td')
diff_h += mktag('/tr')
diff_h += mktag('/tr')
diff_h += mktag('/table')

outf.write_html('pub/%s/index.html' % mainxdid, diff_h, title='Comparison for ' + mainxdid)


Expand Down
2 changes: 1 addition & 1 deletion scripts/40-deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ aws s3 sync --region $REGION $WWW ${S3WWW}/ --acl public-read
# concatenate all logfiles from working dirs and copy to cloud
ALLLOGS=$WWW/log/$TODAY-logs.txt
scripts/49-cat-logs.py -o $ALLLOGS $PUB $TMP
aws s3 cp --region $REGION $ALLLOGS ${S3WWW}/log/ --acl public-read
aws s3 cp --region $REGION $ALLLOGS ${S3WWW}/logs/ --acl public-read

7 changes: 3 additions & 4 deletions scripts/rewrite_corpus
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

from xdfile.utils import get_args, get_parser, open_output, log
from xdfile.utils import get_args, get_parser, open_output, log, info, error, warn

import xdfile
import os
Expand All @@ -9,7 +9,6 @@ import os
def collapse_whitespace(s):
    """Return *s* with every line individually stripped and the results
    concatenated (no separators), then stripped once more at the edges."""
    stripped_lines = [line.strip() for line in s.splitlines()]
    return u"".join(stripped_lines).strip()


if __name__ == "__main__":
p = get_parser("rewrite corpus")
p.add_argument('--noclues', nargs='?', help='omit clues')
Expand All @@ -28,10 +27,10 @@ if __name__ == "__main__":
outxdtt = xd.transpose().transpose()
if collapse_whitespace(outxd) != collapse_whitespace(outxdtt):
for a, b in xd.diffs(outxdtt):
log("diff: %s | %s" % (a, b))
info("diff: %s | %s" % (a, b))
raise Exception("differs when double-transposed")
except Exception, e:
log(unicode(e))
error(unicode(e))
if args.debug:
raise

Expand Down
4 changes: 2 additions & 2 deletions scripts/tsv2sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import sqlite3
import xdfile.utils
from xdfile.utils import args_parser, get_args
from xdfile.utils import args_parser, get_args, info
from xdfile import metadatabase as metadb


Expand All @@ -19,7 +19,7 @@ def main():
cur = sqlconn.cursor()

rows = [list(r) for r in xdfile.utils.parse_tsv_rows(args.inputs[0], "Receipt")]
print("Rows to be inserted to sql: %s" % len(rows))
info("Rows to be inserted to sql: %s" % len(rows))
cur.executemany('INSERT INTO receipts VALUES (?,?,?,?,?,?)', rows)
sqlconn.commit()

Expand Down
Loading