Staging #29

Merged: 22 commits merged on Sep 2, 2016
Changes from 7 commits
8 changes: 7 additions & 1 deletion scripts/05-sql-import-receipts.sh
@@ -6,5 +6,11 @@ METADB=meta.db

if [ ! -f $METADB ] ; then
sqlite3 $METADB < ./scripts/meta.sql
./scripts/tsv2sqlite.py ${DEBUG} -o ${METADB} gxd/receipts.tsv
./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Receipt" -o ${METADB} gxd/receipts.tsv
./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Publication" -o ${METADB} gxd/publications.tsv
./scripts/tsv2sqlite.py ${DEBUG} --tsvtype "Similar" -o ${METADB} gxd/similar.tsv
else
echo "$METADB already exists"
fi
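
(Editorial aside: tsv2sqlite.py itself is not part of this diff. Below is only a minimal sketch of the kind of type-to-table dispatch the new --tsvtype flag implies; the table names and column handling are assumptions, not taken from scripts/meta.sql.)

#!/usr/bin/env python3
# Hypothetical sketch only: the table names below are assumptions, not from meta.sql.
import argparse
import csv
import sqlite3

# assumed mapping from --tsvtype value to target table
TSVTYPE_TABLES = {
    "Receipt": "receipts",
    "Publication": "publications",
    "Similar": "similar_grids",
}

def load_tsv(dbpath, tsvtype, tsvpath):
    table = TSVTYPE_TABLES[tsvtype]
    with open(tsvpath, newline="") as fp, sqlite3.connect(dbpath) as db:
        rows = list(csv.DictReader(fp, delimiter="\t"))
        if not rows:
            return
        cols = list(rows[0].keys())  # assumes TSV headers match the table's columns
        sql = "INSERT INTO %s (%s) VALUES (%s)" % (
            table, ",".join(cols), ",".join("?" for _ in cols))
        db.executemany(sql, [[r[c] for c in cols] for r in rows])

if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--tsvtype", required=True, choices=sorted(TSVTYPE_TABLES))
    p.add_argument("-o", dest="output", required=True, help="path to the sqlite db")
    p.add_argument("tsv")
    a = p.parse_args()
    load_tsv(a.output, a.tsvtype, a.tsv)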


3 changes: 2 additions & 1 deletion scripts/20-analyze.sh
@@ -8,9 +8,10 @@ mkdir -p $PUB
rm -f $PUB/*

# regenerate pub/puzzles.tsv
# TODO: should populate puzzles table in sqlite instead
scripts/21-clean-metadata.py -o $PUB/puzzles.tsv $GXD

# regenerate pub/pubyears.tsv
scripts/22-pubyears.py
scripts/25-analyze-puzzle.py -o $WWW/ -c $GXD $GXD
scripts/26-clues-tsv.py -c $GXD -o $PUB/
scripts/27-pubyear-stats.py -c ${GXD}
165 changes: 165 additions & 0 deletions scripts/21b-clean-metadata.py
@@ -0,0 +1,165 @@
#!/usr/bin/env python3

# Usage: $0 [-o <puzzles.tsv>] <input>
#
# Generates puzzles.tsv with cleaned metadata for each .xd in <input>.
#

from xdfile import utils, metasql as metadb
import xdfile
import re


CLEAN_SUFFIX = '_clean'


def find_date(s):
m = re.search(r"\s*(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)?\s*(\d{1,2})?,?\s*\d{4},?\s*", s, flags=re.IGNORECASE)
if m:
return m.group(0)

m = re.search(r"\d{2}[/\-]?\d{2}[/\-]?\d{2,4}", s)
if m:
return m.group(0)

return ""


def boil_copyright(copyright, author):
import re
if author:
copyright = copyright.replace(author, "")

# and remove textual date
dt = find_date(copyright)
if dt:
copyright = copyright.replace(dt, " ")

# copyright = copyright.replace(u"©", "(c)")

return copyright


# cleans the author byline and also extracts/cleans the editor
def clean_author(author, editor):
if author:
r = r'(?i)(?:(?:By )*(.+)(?:[;/,-]|and) *)?(?:edited|Editor|(?<!\w)Ed[.])(?: By)*(.*)'
m = re.search(r, author)
if m:
author, editor = m.groups()

if author:
while author.lower().startswith("by "):
author = author[3:]

while author and author[-1] in ",.":
author = author[:-1]
else:
author = ""

if " / " in author:
if not editor:
author, editor = author.rsplit(" / ", 1)

if editor:
while editor.lower().startswith("by "):
editor = editor[3:]

while editor and editor[-1] in ",.":
editor = editor[:-1]

author = author.strip()
editor = editor.strip()
return author, editor
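
# Rough example of the intended split (hypothetical byline):
#   clean_author("By Jane Doe / Edited by John Smith", "") -> ("Jane Doe", "John Smith")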


def clean_title(title):
if title.endswith(']'):
title = title[:title.rfind('[')]

# for some USA Today puzzles, the title is only the text between the double quotes
if title.startswith("USA Today"):
if title and title[-1] == '"':
title = title[title.index('"') + 1:-1]
if title[-1] == ",":
title = title[:-1]
elif title and title[0] == '"':
title = title[1:title.rindex('"')]

return title
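
# Rough examples (hypothetical titles):
#   clean_title('USA Today "Raise the Roof"')    -> 'Raise the Roof'
#   clean_title('Themeless Monday [2016-01-04]') -> 'Themeless Monday ' (bracketed suffix dropped, trailing space kept)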


def clean_headers(xd):
# remove known unwanted header fields, log unknown headers
for hdr in list(xd.headers.keys()):
if hdr in ["Source", "Identifier", "Acquired", "Issued", "Category"]:
xd.set_header(hdr, None)
else:
if hdr.lower() not in xdfile.HEADER_ORDER:
utils.warn("%s: '%s' header not known: '%s'" % (xd.filename, hdr, xd.headers[hdr]))

# clean Author and Editor headers
author = xd.get_header("Author") or ""
if not author:
if xd.get_header("Creator"):
assert not author
author = xd.get_header("Creator")
xd.set_header("Creator", None)

editor = xd.get_header("Editor") or ""

newauthor, neweditor = clean_author(author, editor)

if newauthor != author:
xd.set_header("Author" + CLEAN_SUFFIX, newauthor)

if neweditor != editor:
xd.set_header("Editor" + CLEAN_SUFFIX, neweditor)

# clean Title header
title = xd.get_header("Title") or ""
newtitle = clean_title(title)

if newtitle != title:
xd.set_header("Title" + CLEAN_SUFFIX, newtitle)
# create Date header
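# fallback order: existing Date header, then a date parsed from the filename, then a date found in the Copyright text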
dt = xd.get_header("Date")

## try getting Date from filename
if not dt:
try:
d = utils.parse_date_from_filename(xd.filename)
if d:
dt = d.strftime("%Y-%m-%d")
except Exception as e:
utils.error(str(e))
if args.debug:
raise

## try getting Date from copyright
if not dt:
rights = xd.get_header("Copyright") or ""
dt = find_date(rights)

if dt:
xd.set_header("Date", dt)



def main():
args = utils.get_args(desc='outputs cleaned puzzle metadata rows')

xd_list = []
# Prepare the full list of cleaned puzzles first
for input_source in args.inputs:
for fn, contents in utils.find_files(input_source, ext='.xd'):
xd = xdfile.xdfile(contents.decode('utf-8'), fn)
clean_headers(xd)
xd_list.append(xd)
# metadb.update_puzzles_row(xd)
# then write them all to the SQL database in one batch
metadb.append_puzzles(xd_list)


if __name__ == "__main__":
main()
40 changes: 0 additions & 40 deletions scripts/22-pubyears.py

This file was deleted.

144 changes: 144 additions & 0 deletions scripts/27-pubyear-stats.py
@@ -0,0 +1,144 @@
#!/usr/bin/env python3

import json
import re
from collections import defaultdict, Counter

from xdfile.utils import error, debug, info
from xdfile import utils, metasql, metadatabase as metadb
from xdfile import year_from_date, dow_from_date
import xdfile



def main():
args = utils.get_args('generate pub-years data')
outf = utils.open_output()

weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]

pubyears = {} # (pubid, year) -> list of xd
for xd in xdfile.corpus():
puby = (xd.publication_id(), xd.year())
if puby not in pubyears:
pubyears[puby] = []
pubyears[puby].append(xd)

if pubyears:
metasql.execute("DELETE FROM stats;")

for puby, xdlist in sorted(pubyears.items()):
pubid, year = puby
npublic = 0

# TODO: SELECT FROM publications
nexisting = 0

# organize by day-of-week
byweekday = {}
byweekday_similar = {}
for w in weekdays:
byweekday[w] = []
byweekday_similar[w] = []

for xd in xdlist:
dow = dow_from_date(xd.get_header('Date'))
if dow: # Might be empty date or only a year
byweekday[dow].append(xd)

for r in metasql.select("SELECT * FROM similar_grids WHERE xdid LIKE '{}%' AND GridMatchPct > 25".format(pubid + str(year))):
xd = xdfile.get_xd(r['xdid'])
if xd:
dt = xd.get_header('Date')
if dt:
assert dt
dow = dow_from_date(dt)
if dow: # Might be empty date or only a year
byweekday_similar[dow].append(r)
else:
debug("Date not set for: %s" % xd)

# tally stats
for weekday in weekdays:
copyrights = Counter() # [copyright_text] -> number of xd
editors = Counter() # [editor_name] -> number of xd
formats = Counter() # ["15x15 RS"] -> number of xd
# TODO: derive nexisting from the publications table (see TODO above)
nexisting = 0

nxd = len(byweekday[weekday])
public_xdids = [] # Empty for now
for xd in byweekday[weekday]:
xdid = xd.xdid()
if (year.isdigit() and int(year) <= 1965) or xdid in public_xdids:
npublic += 1

editor = xd.get_header('Editor').strip()
if editor:
editors[editor] += 1

sizestr = xd.sizestr()
if sizestr:
formats[sizestr] += 1

copyright = xd.get_header('Copyright').strip()
if copyright:
copyrights[copyright] += 1

# debug("ME: %s MCPR: %s MF: %s" % (list(editors), list(copyrights), list(formats)))
def process_counter(count, comp_value):
# Return the most common item from count, suffixed with its tally unless it equals comp_value (i.e. covers every puzzle)
if count:
item, num = count.most_common(1)[0]
if num != comp_value:
item += " (%s)" % num
else:
item = ''
return item
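
# e.g. with editors == Counter({'Will Shortz': 5, 'Someone Else': 1}) and nxd == 6,
# this yields 'Will Shortz (5)'; if one editor covered all nxd puzzles, just the bare name.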

#
maineditor = process_counter(editors, nxd)
maincopyright = process_counter(copyrights, nxd)
mainformat = process_counter(formats, nxd)

reprints = 0
touchups = 0
redones = 0
copies = 0
themecopies = 0
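# classify each later-dated match: same author -> reprint (100% grid match), touchup (>=50%),
# or themecopy (>=30%); different author -> copy (>=50%) or themecopy (>=30%)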
for r in byweekday_similar[weekday]:
# debug("Xdid %s Xdidmatch %s" % (r['xdid'], r['xdidMatch']))
xd1 = xdfile.get_xd(r['xdid'])
xd2 = xdfile.get_xd(r['xdidMatch'])
if xd1 is None or xd2 is None:
continue
# debug("XD1: %s XD2: %s" % (xd1, xd2))
dt1 = xd1.get_header('Date')
dt2 = xd2.get_header('Date')
aut1 = xd1.get_header('Author')
aut2 = xd2.get_header('Author')
pct = int(r['GridMatchPct'])
if dt2 < dt1: # only capture the later one
if aut1 == aut2:
if pct == 100:
reprints += 1
elif pct >= 50:
touchups += 1
elif pct >= 30:
themecopies += 1
else: # suspicious
if pct >= 50:
copies += 1
elif pct >= 30:
themecopies += 1

metasql.execute("INSERT INTO stats VALUES (?,?,?, ?,?,?, ?, ?,?,?, ?,?, ?,?)",
(pubid, year, weekday,
mainformat, maineditor, maincopyright,
nexisting, nxd, npublic,
reprints, touchups, redones,
copies, themecopies))


if __name__ == "__main__":
main()
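
(Editorial aside: the INSERT above supplies 14 values per row. The actual stats schema lives in scripts/meta.sql and is not shown in this diff; the sketch below is only a guess at a compatible table, with column names taken from the variable names.)

# Hypothetical sketch: column names inferred from the INSERT's variables, not from meta.sql.
import sqlite3

STATS_DDL = """
CREATE TABLE IF NOT EXISTS stats (
    pubid TEXT, year TEXT, weekday TEXT,
    mainformat TEXT, maineditor TEXT, maincopyright TEXT,
    nexisting INTEGER, nxd INTEGER, npublic INTEGER,
    reprints INTEGER, touchups INTEGER, redones INTEGER,
    copies INTEGER, themecopies INTEGER
)
"""

if __name__ == "__main__":
    with sqlite3.connect("meta.db") as db:
        db.execute(STATS_DDL)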
3 changes: 3 additions & 0 deletions scripts/30-mkwww.sh
@@ -15,6 +15,9 @@ cp $PUB/*.tsv $WWW/pub/
echo -en "${GREEN}Generate /pub/[<pub>][<year>]${NORMAL}\n"
scripts/31-mkwww-publishers.py $CORPUS -o $WWW/

echo -en "${GREEN}Generate /pub/ index${NORMAL}\n"
scripts/37-pubyear-svg.py -o $WWW/

echo -en "${GREEN}Generate /pub/word/<ANSWER>${NORMAL}\n"
scripts/33-mkwww-words.py $CORPUS -o $WWW/
