Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Staging #29

Merged
merged 22 commits into from
Sep 2, 2016
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions scripts/05-sql-import-receipts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,8 @@ METADB=meta.db
# Build the metadata database only when it does not exist yet: load the
# schema first, then import each TSV file into it.
if [ ! -f "$METADB" ] ; then
    sqlite3 "$METADB" < ./scripts/meta.sql
    # ${DEBUG} is left unquoted so that an empty or unset DEBUG contributes
    # no argument at all to the command line.
    for tsv in gxd/receipts.tsv gxd/publications.tsv gxd/similar.tsv ; do
        ./scripts/tsv2sqlite.py ${DEBUG} -o "$METADB" "$tsv"
    done
fi


29 changes: 21 additions & 8 deletions scripts/22-pubyears.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,21 @@ def main():
args = utils.get_args('generate pub-years data')

pubyears = [ (utils.parse_pubid(r.xdid), year_from_date(r.Date), dow_from_date(r.Date))
for r in metadb.xd_puzzles().values() ]

weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]
for r in metadb.xd_puzzles().values() ]

weekdays = [ 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun' ]

metasql.execute("INSERT INTO stats VALUES (?,?,?, ?,?, ?, ?,?,?, ?,?, ?,?)",
(pubid, year, weekday,
maineditor, maincopyright,
nexisting, nxd, npublic,
reprints, touchups, redones,
copies, themecopies))





pubs = defaultdict(dict)
for pubid, year, dow in pubyears:
if pubid not in pubs or int(year) not in pubs[pubid]:
Expand All @@ -29,12 +41,13 @@ def main():
if y < 1900 or y > 2100:
continue
# Preserve weekday order
dow_list = []
for d in weekdays:
dow_list.append(str(years_dow[y][d]))

metadb.append_row('pub/pubyears.tsv', "pubid year total " + " ".join(weekdays),
[ pubid, y, sum(years_dow[y].values()), "\t".join(dow_list) ])
ndup = 0
nsusp = 0
for r in metasql.xd_similar():
"SELECT * FROM similar WHERE pubid = ? AND year = ?", pubid, y)
("INSERT INTO pubyears VALUES (?, ?, ?, ?, ?)metadb.append_row('pub/pubyears.tsv', "pubid year weekday total ndup nsusp",
[ pubid, y, d, years_dow[y][d], ndup, nsusp])

if __name__ == "__main__":
main()
112 changes: 112 additions & 0 deletions scripts/37-pubyear-svg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/python3

from datetime import date
from xdfile import utils
import xdfile

# SVG template for one 30x30 "year widget". It is filled in with
# `pys % (bgclass, rects)`: the first %s becomes the CSS class of the
# full-size background <rect>, the second %s is inserted verbatim just
# before the closing </svg> tag.
pys = '''
<svg class="year_widget" width="30" height="30">
<g transform="translate(0,0)">
<rect class="%s" width="30" height="30"></rect>
</g>
%s
</svg>
'''


def rect(x, y, w, h, *classes):
    """Return the markup for one SVG ``<rect>`` element.

    Args:
        x, y: offsets placed in the element's ``translate(x,y)`` transform.
        w, h: values for the ``width`` and ``height`` attributes.
        *classes: zero or more CSS class names for the ``class`` attribute.

    Returns:
        The ``<rect ...></rect>`` element as a string.
    """
    # Class names must be separated by spaces; joining them with '' would
    # fuse several names into a single, meaningless class token.
    return (
        '<rect transform="translate({x},{y})" class="{classes}" '
        'width="{w}" height="{h}"></rect>'
    ).format(x=x, y=y, w=w, h=h, classes=' '.join(classes))


def year_from(dt):
    """Return the year of *dt* as an int.

    Args:
        dt: either a string whose first '-'-separated field is the year
            (e.g. "2016-09-02"), or an object exposing a ``year``
            attribute such as ``datetime.date``.

    Returns:
        The year as an integer.

    Raises:
        ValueError: if the leading field of a string is not an integer.
    """
    # Date-like objects already carry the year; this module handles both
    # date objects and date strings, so accept either.
    year = getattr(dt, 'year', None)
    if year is not None:
        return int(year)
    return int(dt.split('-')[0])

def weekdays_between(dta, dtb, weekday=None):
    """Placeholder for counting weekdays between two dates; returns 0.

    Args:
        dta: start of the range (not inspected yet).
        dtb: end of the range (not inspected yet).
        weekday: optional weekday index. pubyear_svg() passes it as a third
            positional argument, which the previous two-parameter
            signature rejected with a TypeError.

    Returns:
        Always 0 until the real count is implemented.
    """
    # TODO: implement the actual count of `weekday` occurrences in the range.
    return 0


def pubyear_svg(corpus, nsusp, ndup, npub, npriv):
    """Build the SVG year widget markup for one publication-year.

    Emits seven rows (one per loop index 0..6). Each row is a ``<g>``
    holding six rects laid out left to right with the classes prexd,
    suspxd, dupxd, privxd, pubxd and postxd.

    Args:
        corpus: indexable collection of puzzle objects; each is used via
            ``width()``, ``height()`` and ``Date``.
        nsusp, ndup, npriv, npub: used directly as the widths of the
            suspxd / dupxd / privxd / pubxd rects (same value on every row).

    Returns:
        The ``pys`` template filled with the background class and the rows.

    NOTE(review): this looks unfinished -- see the TODO and notes below.
    """
    # The background class is currently always "notexists"; the intended
    # lookup is still commented out.
    bgclass = "notexists"
    # if bgclass not in publications.tsv:
    #     bgclass = "exists"

    rects = ''

    for i in range(0, 7):
        y = i*3

        # TODO: find first xd of weekday i
        # NOTE(review): corpus[i] / corpus[1-i] are positional stand-ins;
        # 1-i yields indices 1, 0, -1, ..., -5 -- confirm the intent.
        firstxd = corpus[i]
        lastxd = corpus[1-i]

        # Row height: 3 when the first puzzle's grid area exceeds 17*17,
        # otherwise 2.
        sz = firstxd.width() * firstxd.height()
        h = 3 if sz > 17*17 else 2

        x = 0
        w = 6  # overwritten below before it is used

        # NOTE(review): the id is "mon" for every row, not just the first.
        # The rects inside also carry their own translate(x, y).
        rects += '''<g id="mon" transform="translate(0,{y})">'''.format(y=y)

        # NOTE(review): weekdays_between is defined in this file with two
        # parameters but is called with three here and below.
        npre = weekdays_between(date(year_from(firstxd.Date), 1, 1), firstxd.Date, i)
        w = npre
        rects += rect(x, y, w, h, 'prexd')
        x += w

        w = nsusp
        rects += rect(x, y, w, h, 'suspxd')
        x += w

        w = ndup
        rects += rect(x, y, w, h, 'dupxd')
        x += w

        w = npriv
        rects += rect(x, y, w, h, 'privxd')
        x += w

        w = npub
        rects += rect(x, y, w, h, 'pubxd')
        x += w

        npost = weekdays_between(lastxd.Date, date(year_from(lastxd.Date), 12, 31), i)
        w = npost
        rects += rect(x, y, w, h, 'postxd')
        rects += '</g>'

    return pys % (bgclass, rects)


def main():
    """Group the corpus by publication-year and print one widget as HTML.

    Puzzles from ``xdfile.corpus()`` are bucketed under the key
    ``publication_id() + str(year())``; the page printed to stdout embeds
    the widget for the 'up2011' bucket (KeyError if that bucket is absent).
    """
    # NOTE(review): this description does not mention SVG widgets; it looks
    # carried over from another script -- confirm.
    p = utils.args_parser(desc="annotate puzzle clues with earliest date used in the corpus")
    # NOTE(review): no action= is given; if this is a standard argparse
    # parser the option expects a value -- confirm whether a flag was meant.
    p.add_argument('-a', '--all', default=False, help='analyze all puzzles, even those already in similar.tsv')
    # args, outf and prev_similar are not used further in this function.
    args = utils.get_args(parser=p)
    outf = utils.open_output()

    prev_similar = utils.parse_tsv('gxd/similar.tsv', "similar")
    pubyears = {}
    for xd in xdfile.corpus():
        pubyear = xd.publication_id() + str(xd.year())
        if pubyear not in pubyears:
            pubyears[pubyear] = []
        pubyears[pubyear].append(xd)

    # NOTE(review): pubyear_svg is declared with five parameters
    # (corpus, nsusp, ndup, npub, npriv) but receives three arguments here.
    print('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<meta charset="utf-8">
<meta name="description" content="">
<meta name="keywords" content="">
<meta name="author" content="">
<title></title>
<link href="style.css" rel="stylesheet" type="text/css">
<script src="script.js"></script>
</head>
<body>
%s
</body>
</html>''' % pubyear_svg(pubyears['up2011'], 'up', 2011))


if __name__ == "__main__":
main()

84 changes: 76 additions & 8 deletions scripts/meta.sql
Original file line number Diff line number Diff line change
@@ -1,11 +1,79 @@
CREATE TABLE "receipts" (
"CaptureTime" TEXT,
"ReceivedTime" TEXT,
"ExternalSource" TEXT,
"InternalSource" TEXT,
"SourceFilename" TEXT,
"xdid" TEXT
-- xd

-- One row per received source file. The composite primary key allows at
-- most one row for each (ExternalSource, SourceFilename) pair.
CREATE TABLE receipts (
CaptureTime TEXT,
ReceivedTime TEXT,
ExternalSource TEXT,
InternalSource TEXT,
SourceFilename TEXT,
xdid CHAR(16), -- puzzle id; indexed below
PRIMARY KEY (ExternalSource, SourceFilename)
);

-- Index for looking up receipts by xdid.
CREATE INDEX XDID on receipts (xdid ASC);


-- Pairs of puzzles with similar grids: one row per (xdid, xdidMatch) pair.
CREATE TABLE similar_grids (
xdid CHAR(16),
xdidMatch CHAR(16), -- the other puzzle of the pair
GridMatchPct INTEGER -- presumably a 0-100 percentage; TODO confirm
);


-- Per-puzzle reuse counts. Presumably how many of the puzzle's clues and
-- answers also appear elsewhere in the corpus -- TODO confirm.
CREATE TABLE similar_clues (
    xdid CHAR(16),
    reused_clues INTEGER,
    reused_answers INTEGER,  -- was the garbled name "Nujmreused_answers"
    total_clues INTEGER      -- no trailing comma: SQLite rejects ",)"
);


-- One row per publication, with its publisher and issue range.
CREATE TABLE publications (
PublicationAbbr CHAR(8),
PublisherAbbr CHAR(8),
PublicationName TEXT,
PublisherName TEXT,
FirstIssueDate CHAR(10), -- presumably "YYYY-MM-DD"; TODO confirm
LastIssueDate CHAR(10), -- presumably "YYYY-MM-DD"; TODO confirm
NumberIssued INTEGER,
Contact TEXT,
Sources TEXT
);

CREATE INDEX "XDID" on receipts (xdid ASC);

-- One row per puzzle with its header metadata.
CREATE TABLE puzzles (
    xdid CHAR(16),      -- "eltana-001"
    Date CHAR(10),      -- "2016-07-18"
    Size CHAR(8),       -- "15x15RS" (Rebus/Special)

    Title TEXT,
    Author TEXT,
    Editor TEXT,
    Copyright TEXT,
    A1_D1 TEXT          -- presumably the 1-Across / 1-Down entries; TODO confirm
                        -- (last column: no trailing comma, SQLite rejects ",)")
);


-- grouped by pub-year-weekday
-- Aggregated statistics, one row per (pubid, year, weekday).
CREATE TABLE pubyears (
    pubid CHAR(6),              -- "nyt"
    year CHAR(4),               -- "2006"
    weekday CHAR(3),            -- "Mon"

    Editor TEXT,                -- most common entry
    Copyright TEXT,             -- most common, after removing Date/Author

    NumExisting INTEGER,        -- known or assumed to be in existence (0 means unknown)
    NumXd INTEGER,              -- total number in xd
    NumPublic INTEGER,          -- available for public download

    -- duplicate grids, same author
    NumReprints INTEGER,        -- 100% grid match
    NumTouchups INTEGER,        -- 75-99% grid match
    NumRedone INTEGER,          -- 30-75% grid match

    -- duplicate grids, different author
    NumSuspicious INTEGER,      -- >50% similar grid
    MaxSuspiciousPct INTEGER    -- highest grid match of all suspicious
                                -- (last column: no trailing comma, SQLite rejects ",)")
);

61 changes: 42 additions & 19 deletions xdfile/pubyear.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import cgi
from collections import Counter, defaultdict
from collections import defaultdict

from xdfile.html import th, td, mkhref, mktag, tr_empty, td_with_class, year_widget, decade_widget
from xdfile.html import mkhref, mktag, tr_empty, td_with_class
from xdfile import utils, metadatabase as metadb
from xdfile.utils import space_with_nbsp
import xdfile
from datetime import date


Expand All @@ -18,13 +16,13 @@ def mkcell(text, href="", title=""):
def split_year(y):
    """Format a year as a two-line HTML header label.

    The lower line holds everything after the first two characters of the
    year. The upper line holds the first two characters only when the
    fourth character is '0' (e.g. 1990, 2000); otherwise it is a
    non-breaking space.

    Args:
        y: the year, as a string or an int (at least four characters long
            once converted to a string).

    Returns:
        A string of the form "<upper><br/><lower>".
    """
    # Convert once so int years work too; indexing the raw argument used to
    # raise TypeError for ints while the slices already went through str().
    ys = str(y)
    lsy = ys[2:]
    if ys[3] != '0':
        msy = '&nbsp;'
    else:
        msy = ys[:2]

    return "%s<br/>%s" % (msy, lsy)


def get_pubheader_classes(*years):
"""
Assign classes to years header
Expand All @@ -34,9 +32,34 @@ def get_pubheader_classes(*years):
if "&nbsp" in str(y):
classes.append("ord-year")
else:
classes.append("zero-year")
classes.append("zero-year")
return classes



def year_widget(dow_dict, total, fill_class=None):
    """Render the SVG widget showing the per-weekday spread for one year.

    One 3-unit-high bar is drawn per entry of ``utils.WEEKDAYS``; bar *i* is
    shifted down by ``4*i``. A bar's width is the entry's 'count' (default
    '0'), widened to '30' once the count reaches 26; its CSS class is the
    entry's 'class' (default ''). ``total`` is accepted but not used.
    """
    background = fill_class or 'white'
    parts = [
        '<svg class="year_widget" width="30" height="30">',
        '<g transform="translate(0,0)"><rect class="%s" width="30" height="30"></rect></g>' % background,
    ]
    for row, day in enumerate(utils.WEEKDAYS):
        entry = dow_dict[day]
        css = entry['class'] if 'class' in entry.keys() else ''
        width = str(entry['count']) if 'count' in entry.keys() else '0'
        # 26 or more (half of 52 weeks) is drawn as a completely filled row
        if int(width) >= 26:
            width = '30'
        offset = str(row*3+row)
        parts.append(''.join([
            '<g transform="translate(0,', offset, ')"><rect class="', css,
            '" width="', width, '" height="3"></rect></g>',
        ]))
    parts.append('</svg>')
    return ' '.join(parts)

def decade_widget(total, fill_class=None):
    """Render the SVG widget for a collapsed decade, showing its total.

    The background rect takes ``fill_class`` ('green' when none is given),
    and ``str(total)`` is placed in a ``<text>`` element.
    """
    background = fill_class or 'green'
    return ' '.join([
        '<svg class="year_widget" width="30" height="30">',
        '<g transform="translate(0,0)"><rect class="%s" width="30" height="30"></rect></g>' % background,
        '<text x="25" y="18">' + str(total) + '</text>',
        '</svg>',
    ])


g_all_pubyears = None
def pubyear_html(pubyears=[], skip_decades=None):
Expand All @@ -47,9 +70,9 @@ def pubyear_html(pubyears=[], skip_decades=None):
if not g_all_pubyears:
g_all_pubyears = utils.parse_tsv_data(open("pub/pubyears.tsv").read(), "pubyear")


# Read similars to make background of widgets
similar_d = defaultdict(dict)
similar_d = defaultdict(dict)
for xdid, v in utils.parse_tsv('gxd/similar.tsv', "similar").items():
xd_split = utils.split_xdid(xdid)
if xd_split:
Expand All @@ -61,7 +84,7 @@ def pubyear_html(pubyears=[], skip_decades=None):

b = [] # Body

# Making collapsed decaded depends on args
# Making collapsed decades depends on args
skip_decades = skip_decades if skip_decades else { 'start': 1910, 'end': 1970 }
allyears = []
for i in range(skip_decades['start']//10, skip_decades['end']//10 + 1):
Expand Down Expand Up @@ -108,25 +131,25 @@ def pubyear_html(pubyears=[], skip_decades=None):
'hint': hint,
'total': int(total),
}

# main table
b.append('<table class="pubyears">')
yhdr = [ '&nbsp;' ] + [ split_year(y) for y in allyears ]
yhdr.append("all")
b.append(td_with_class(*yhdr, classes=get_pubheader_classes(*yhdr),
rowclass="pubyearhead",tag="th"))
b.append(tr_empty())
# Process each pubid sorted by earliest year
b.append(tr_empty())

# Process each pubid sorted by earliest year
for pubid in sorted(pubs, key=lambda x:min(pubs[x])):
pub = metadb.xd_publications().get(pubid)
pubname = pub.PublicationName if pub else ''
# Pub id to first column
# Pub id to first column
b.append(mktag('tr'))
b.append(mktag('td','pub'))
b.append(mkcell(space_with_nbsp(pubname or pubid), "/pub/" + pubid, ))
b.append(mktag('/td'))

# Process each year not collapsed into decade
for yi in allyears:
if yi in pubs[pubid] and pubs[pubid][yi]['total'] > 0:
Expand All @@ -140,11 +163,11 @@ def pubyear_html(pubyears=[], skip_decades=None):
b.append(mktag('td', 'block'))
b.append('&nbsp;')
b.append(mktag('/td'))

b.append(mktag('td'))
b.append(str(sum([ pubs[pubid][x]['total'] for x in pubs[pubid].keys() ])))
b.append(mktag('/td'))
b.append(mktag('/tr'))

b.append(mktag('/table'))
return (" ".join(b))