-
Notifications
You must be signed in to change notification settings - Fork 767
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rewrite Ancestry.com Federal Census for current site (#2582)
- Loading branch information
1 parent
edfee55
commit 59facf0
Showing
1 changed file
with
137 additions
and
210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,237 +1,164 @@ | ||
{ | ||
"translatorID": "0dda3f89-15de-4479-987f-cc13f1ba7999", | ||
"label": "Ancestry.com US Federal Census", | ||
"creator": "Elena Razlogova", | ||
"target": "^https?://search\\.ancestry\\.com/.*(usfedcen|1890orgcen|1910uscenindex)", | ||
"minVersion": "1.0.0b4.r1", | ||
"creator": "Abe Jellinek", | ||
"target": "^https?://(www\\.)?ancestry\\.com/", | ||
"minVersion": "3.0", | ||
"maxVersion": "", | ||
"priority": 100, | ||
"inRepository": true, | ||
"translatorType": 4, | ||
"browserSupport": "g", | ||
"lastUpdated": "2015-06-02 10:57:09" | ||
"browserSupport": "gcibv", | ||
"lastUpdated": "2021-07-22 19:20:28" | ||
} | ||
|
||
function detectWeb(doc, url) { | ||
var namespace = doc.documentElement.namespaceURI; | ||
var nsResolver = namespace ? function(prefix) { | ||
if (prefix == 'x') return namespace; else return null; | ||
} : null; | ||
|
||
var result = doc.evaluate('//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]', doc, nsResolver, | ||
XPathResult.ANY_TYPE, null).iterateNext(); | ||
/* | ||
***** BEGIN LICENSE BLOCK ***** | ||
var rows = doc.evaluate('//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrow record"]', | ||
doc, nsResolver, XPathResult.ANY_TYPE, null); | ||
var row; | ||
while (row = rows.iterateNext()) { | ||
links = doc.evaluate('.//a', row, nsResolver, XPathResult.ANY_TYPE, null); | ||
var linkNo=0; | ||
while (link=links.iterateNext()) { | ||
linkNo=linkNo+1; | ||
} | ||
break; | ||
} | ||
Copyright © 2021 Abe Jellinek | ||
if (result && linkNo == 2) { | ||
return "multiple"; | ||
} else { | ||
var indivRe = /indiv=1/; | ||
var m = indivRe.exec(doc.location.href); | ||
var indiv = 0; | ||
if (m) { | ||
indiv = 1; | ||
} | ||
This file is part of Zotero. | ||
checkURL = doc.location.href.replace("pf=", ""); | ||
if (doc.location.href == checkURL && indiv == 1) { | ||
return "bookSection"; | ||
} | ||
} | ||
} | ||
Zotero is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU Affero General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
// this US Federal Census scraper is a hack - so far there is no proper item type in Zotero for this kind of data (added to trac as a low priority ticket) | ||
// this scraper creates proper citation for the census as a whole (should be cited as book) | ||
// but also adds name, city, and state for a particular individual to the citation to make scanning for names & places easier in the middle pane | ||
// (that's why the resulting item type is a book section) | ||
// it also adds all searchable text as a snapshot and a scan of the census record as an image | ||
Zotero is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU Affero General Public License for more details. | ||
function scrape(doc) { | ||
var namespace = doc.documentElement.namespaceURI; | ||
var nsResolver = namespace ? function(prefix) { | ||
if (prefix == 'x') return namespace; else return null; | ||
} : null; | ||
|
||
// get initial census data; a proper census record item type should have separate fields for all of these except perhaps dbid | ||
var info = doc.evaluate('//div[@class="facets"][@id="connect"]/div[@class="g_box"]/p/a', | ||
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); | ||
|
||
if (info) { | ||
|
||
info = info.toString(); | ||
var data = new Array(); | ||
var parts = info.split(/[?&]/); | ||
for (var i=0; i<parts.length; i++) { | ||
var part = parts[i]; | ||
var index = part.indexOf("="); | ||
if (index !== -1) { | ||
data[part.substr(0, index)] = part.substr(index+1); | ||
} | ||
} | ||
|
||
if (data.ln) { | ||
var lastName = data.ln.replace(/\+/g, " "); | ||
var firstName = data.fn.replace(/\+/g, " "); | ||
} else { | ||
var lastName = data.fn.replace(/\+/g, " "); | ||
var firstName = ""; | ||
} | ||
var dOb = data.by; // this does not get saved yet because no field is available; the info is in the snapshot | ||
if (data.rfd) { | ||
var yearRe = /([0-9]{4})/; | ||
var m = yearRe.exec(data.rfd); | ||
if (m) { | ||
var year = m[1]; | ||
} | ||
} else { var year = data.ry; } | ||
var state = data.rs.replace(/\+/g, " "); | ||
var county = data.rcnty.replace(/\+/g, " "); // this does not get saved yet because no field is available; the info is in the snapshot | ||
var city = data.rcty.replace(/\+/g, " "); | ||
var dbid = data.dbid; | ||
} | ||
|
||
// set census number for citation - let me know if this can be done in a better way | ||
var censusYear = 0; | ||
var censusNo = ""; | ||
var censusNos = new Array("1790", "First", "1800", "Second", "1810", "Third", "1820", "Fourth", "1830", "Fifth", "1840", "Sixth", "1850", "Seventh", "1860", "Eighth", "1870", "Ninth", | ||
"1880", "Tenth", "1890", "Eleventh", "1900", "Twelfth", "1910", "Thirteenth", "1920", "Fourteenth", "1930", "Fifteenth") | ||
for (var i in censusNos) { | ||
if (censusYear == 1) { censusNo = censusNos[i] }; | ||
if (censusNos[i] == year) { censusYear = 1 } else {censusYear= 0 }; | ||
} | ||
You should have received a copy of the GNU Affero General Public License | ||
along with Zotero. If not, see <http://www.gnu.org/licenses/>. | ||
//begin adding item | ||
var newItem = new Zotero.Item("bookSection"); | ||
newItem.title = city+", "+state; // this is not proper citation but is needed to easily scan for placenames in middle pane | ||
newItem.publicationTitle = censusNo+" Census of the United States, "+year; | ||
newItem.publisher = "National Archives and Records Administration"; | ||
newItem.place = "Washington, DC"; | ||
newItem.date = year; | ||
|
||
// get snapshot with all searchable text and a simplified link to the record for the URL field | ||
var dbRe = /db=([0-9a-z]+)/; | ||
var m = dbRe.exec(doc.location.href); | ||
if (m) { | ||
db = m[1]; | ||
} | ||
var snapshotRe = /\&h=([0-9]+)/; | ||
var m = snapshotRe.exec(doc.location.href); | ||
if (m) { | ||
snapshotURL = "http://search.ancestry.com/cgi-bin/sse.dll?db="+db+"&indiv=1&pf=1&h="+m[1]; | ||
newItem.attachments.push({title:"Ancestry.com Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true}); | ||
cleanURL = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+db+"&fh=0&h="+m[1]; | ||
newItem.url = cleanURL; | ||
} | ||
|
||
// add particular individual being surveyed as contributor - this is not proper citation but is needed so one could easily scan for names in middle pane | ||
var creator = new Array(); | ||
creator.firstName = firstName; | ||
creator.lastName = lastName; | ||
creator.creatorType = "author"; | ||
newItem.creators.push(creator); | ||
|
||
//add proper author for citation | ||
var creator = new Array(); | ||
creator.lastName = "United States of America, Bureau of the Census"; | ||
creator.creatorType = "contributor"; | ||
newItem.creators.push(creator); | ||
***** END LICENSE BLOCK ***** | ||
*/ | ||
|
||
// get scan of the census image | ||
var scanInfo = doc.evaluate('//div[@id="record-main"]/table[@class="p_recTable"]/tbody/tr/td[2][@class="recordTN"]/a', | ||
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); | ||
|
||
if (scanInfo) { | ||
var scanRe = /iid=([A-Z0-9_-]+)/; | ||
var m = scanRe.exec(scanInfo); | ||
if (m) { | ||
scanURL = "http://content.ancestry.com/Browse/print_u.aspx?dbid="+dbid+"&iid="+m[1]; | ||
Zotero.debug("scan url: " + scanURL); | ||
} | ||
} | ||
|
||
if (scanURL){ | ||
Zotero.Utilities.HTTP.doGet(scanURL, function(text) { | ||
Zotero.debug("running doGet"); | ||
Zotero.debug(text); | ||
var imageRe = /950 src="([^"]+)"/; | ||
var m = imageRe.exec(text); | ||
if (m) { | ||
imageURL = m[1]; | ||
Zotero.debug("image url: " + imageURL); | ||
newItem.attachments.push({title:"Ancestry.com Image", mimeType:"image/jpeg", url:imageURL, snapshot:true}); | ||
} | ||
|
||
newItem.complete(); | ||
Zotero.done(); | ||
}); | ||
} else { | ||
newItem.complete(); | ||
Zotero.done(); | ||
|
||
function detectWeb(doc, url) { | ||
if (text(doc, '.pageTitle .pageIntro a').includes('United States Federal Census')) { | ||
return "bookSection"; | ||
} | ||
return false; | ||
} | ||
|
||
function doWeb(doc, url) { | ||
var resultsRegexp = /&h=/; | ||
if (resultsRegexp.test(url)) { | ||
scrape(doc); | ||
} else { | ||
var namespace = doc.documentElement.namespaceURI; | ||
var nsResolver = namespace ? function(prefix) { | ||
if (prefix == 'x') return namespace; else return null; | ||
} : null; | ||
scrape(doc, url); | ||
} | ||
|
||
function scrape(doc, url) { | ||
let item = new Zotero.Item('bookSection'); | ||
|
||
item.title = text(doc, '.pageTitle span'); | ||
let [date, place, roll, page] = doc.querySelectorAll('.sourceText em'); // not ideal | ||
item.bookTitle = text(doc, '.pageTitle .pageIntro a').trim() | ||
+ ` [${place.textContent}]`; | ||
item.publisher = 'National Archives and Records Administration'; | ||
// technically the Census is published 72 years after it's taken, but citing | ||
// that way doesn't seem to be the convention. | ||
item.date = date.textContent; | ||
item.pages = `${page.textContent} (roll ${roll.textContent})`; | ||
item.archive = 'Ancestry.com'; | ||
item.url = url.replace(/[?#].*/, ''); | ||
|
||
let recordTable = doc.querySelector('#recordServiceData'); | ||
if (recordTable) { | ||
recordTable = recordTable.cloneNode(true); | ||
|
||
// get census year for links to items | ||
var yearRe = /db=([0-9]+)/; | ||
var m = yearRe.exec(doc.location.href); | ||
if (m) { | ||
year = m[1]; | ||
} | ||
let familyMembers = recordTable.querySelector('.tableContainerRow'); | ||
if (familyMembers) familyMembers.remove(); | ||
|
||
var dbRe = /db=([0-9a-z]+)/; | ||
var m = dbRe.exec(doc.location.href); | ||
if (m) { | ||
db = m[1]; | ||
} | ||
item.notes.push({ | ||
note: ZU.trimInternal(recordTable.outerHTML) | ||
}); | ||
} | ||
|
||
//select items | ||
var items = new Array(); | ||
var listElts = doc.evaluate('//tr[@class="tblrow record keySelect"] | //tr[@class="tblrow record"] | //tr[@class="tblrowalt record"]', | ||
doc, nsResolver, XPathResult.ANY_TYPE, null); | ||
var recid; | ||
var link; | ||
var name; | ||
while (listElt = listElts.iterateNext()) { | ||
recInfo = doc.evaluate('.//a', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); | ||
var recidRe = /recid=([0-9]+)/; | ||
var m = recidRe.exec(recInfo); | ||
if (m) { | ||
recid = m[1]; | ||
let imageSrc = attr(doc, '.photo.clickable img', 'src'); | ||
let dbId = imageSrc.match(/\/namespaces\/([^/]+)/)[1]; | ||
let imageId = imageSrc.match(/([^/]+)\.jpg/)[1]; | ||
ZU.doGet( | ||
`/imageviewer/api/media/token?dbId=${dbId}&imageId=${imageId}`, | ||
function (respText) { | ||
try { | ||
let json = JSON.parse(respText); | ||
item.attachments.push({ | ||
title: 'Census Record', | ||
mimeType: 'image/jpeg', | ||
url: json.imageDownloadUrl | ||
}); | ||
item.complete(); | ||
} | ||
catch (_) { | ||
item.complete(); // whatever, this is fragile | ||
} | ||
link = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+db+"&fh=0&h="+recid; | ||
name = doc.evaluate('.//span[@class="srchHit"]', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; | ||
items[link] = Zotero.Utilities.trimInternal(name); | ||
} | ||
|
||
items = Zotero.selectItems(items); | ||
if (!items) return true; | ||
|
||
var urls = new Array(); | ||
for (var i in items) { | ||
urls.push(i); | ||
} | ||
|
||
Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); }); | ||
Zotero.wait(); | ||
); | ||
} | ||
|
||
/** BEGIN TEST CASES **/ | ||
var testCases = [ | ||
{ | ||
"type": "web", | ||
"url": "https://www.ancestry.com/discoveryui-content/view/131479739:2442?tid=&pid=&queryId=2a5ea51171527460c8a3755eb4b3fc1e&_phsrc=BYN5&_phstart=successSource", | ||
"items": [ | ||
{ | ||
"itemType": "bookSection", | ||
"title": "Albert Einstein", | ||
"creators": [], | ||
"date": "1940", | ||
"archive": "Ancestry.com", | ||
"bookTitle": "1940 United States Federal Census [Princeton, Mercer, New Jersey]", | ||
"libraryCatalog": "Ancestry.com US Federal Census", | ||
"pages": "10B (roll m-t0627-02357)", | ||
"publisher": "National Archives and Records Administration", | ||
"url": "https://www.ancestry.com/discoveryui-content/view/131479739:2442", | ||
"attachments": [ | ||
{ | ||
"title": "Census Record", | ||
"mimeType": "image/jpeg" | ||
} | ||
], | ||
"tags": [], | ||
"notes": [ | ||
{ | ||
"note": "<table id=\"recordServiceData\" class=\"table tableHorizontal tableHorizontalRuled\"> <tbody> <tr> <th>Name:</th> <td> Albert Einstein </td> </tr> <tr> <th>Respondent:</th> <td> Yes </td> </tr> <tr> <th>Age:</th> <td> 61 </td> </tr> <tr> <th>Estimated Birth Year:</th> <td> <span class=\"srchHit\"> <span title=\"Alternate values for this record\" class=\"altValue\">[abt 1879]</span> </span> <span title=\"This value was member submitted. Click to see details.\" class=\"altValue\"> [<button class=\"link correction\" data-tracking-event=\"content : correction clicked\">14 Mar 1879</button>] </span> </td> </tr> <tr> <th>Gender:</th> <td> Male </td> </tr> <tr> <th>Race:</th> <td> White </td> </tr> <tr> <th>Birthplace:</th> <td> Germany </td> </tr> <tr> <th>Marital Status:</th> <td> Widowed </td> </tr> <tr> <th>Relation to Head of House:</th> <td> Head </td> </tr> <tr> <th>Home in 1940:</th> <td> Princeton, Mercer, New Jersey </td> </tr> <tr> <th>Map of Home in 1940:</th> <td> <button type=\"button\" title=\"View map\" class=\"link mapLink\" data-modal-title=\"Princeton, Mercer, New Jersey\" data-place-names=\"Princeton,Mercer,New Jersey\" data-tracking-event=\"content : map link clicked\">Princeton, Mercer, New Jersey</button> </td> </tr> <tr> <th>Street:</th> <td> Mercer - Street </td> </tr> <tr> <th>House Number:</th> <td> 112 </td> </tr> <tr> <th>Farm:</th> <td> No </td> </tr> <tr> <th>Inferred Residence in 1935:</th> <td> Princeton, Mercer, New Jersey </td> </tr> <tr> <th>Residence in 1935:</th> <td> Princeton </td> </tr> <tr> <th>Resident on farm in 1935:</th> <td> No </td> </tr> <tr> <th>Citizenship:</th> <td> Having first papers </td> </tr> <tr> <th>Sheet Number:</th> <td> 10B </td> </tr> <tr> <th>Number of Household in Order of Visitation:</th> <td> 267 </td> </tr> <tr> <th>Occupation:</th> <td> Pychies Professor </td> </tr> <tr> <th>Industry:</th> <td> Private School </td> </tr> <tr> <th>House Owned or Rented:</th> <td> Owned </td> </tr> <tr> <th>Value of Home or Monthly Rental if Rented:</th> <td> 22000 </td> </tr> <tr> <th>Attended School or College:</th> <td> No </td> </tr> <tr> <th>Highest Grade Completed:</th> <td> College, 5th or subsequent year </td> </tr> <tr> <th>Hours Worked Week Prior to Census:</th> <td> 44 </td> </tr> <tr> <th>Class of Worker:</th> <td> Wage or salary worker in private work </td> </tr> <tr> <th>Weeks Worked in 1939:</th> <td> 52 </td> </tr> <tr> <th>Income:</th> <td> 5000 </td> </tr> <tr> <th>Income Other Sources:</th> <td> Yes </td> </tr> <tr> <th>Neighbors:</th> <td> <button type=\"button\" title=\"View others on page\" class=\"link neighborsLink\" data-modal-title=\"View others on page\" data-image-gid=\"m-t0627-02357-00675:2442\" data-tracking-event=\"content : neighbors link clicked\">View others on page</button> </td> </tr> </tbody> </table>" | ||
} | ||
], | ||
"seeAlso": [] | ||
} | ||
] | ||
}, | ||
{ | ||
"type": "web", | ||
"url": "https://www.ancestry.com/discoveryui-content/view/18443183:7884?tid=&pid=&queryId=283135001368664572d798e1a9012c06&_phsrc=oJW436&_phstart=successSource", | ||
"items": [ | ||
{ | ||
"itemType": "bookSection", | ||
"title": "Pauline Rosenboom", | ||
"creators": [], | ||
"date": "1910", | ||
"archive": "Ancestry.com", | ||
"bookTitle": "1910 United States Federal Census [Bronx Assembly District 34, New York, New York]", | ||
"libraryCatalog": "Ancestry.com US Federal Census", | ||
"pages": "4A (roll T624_1001)", | ||
"publisher": "National Archives and Records Administration", | ||
"url": "https://www.ancestry.com/discoveryui-content/view/18443183:7884", | ||
"attachments": [ | ||
{ | ||
"title": "Census Record", | ||
"mimeType": "image/jpeg" | ||
} | ||
], | ||
"tags": [], | ||
"notes": [ | ||
{ | ||
"note": "<table id=\"recordServiceData\" class=\"table tableHorizontal tableHorizontalRuled\"> <tbody> <tr> <th>Name:</th> <td> <span class=\"srchHit\">Pauline Rosenboom <span title=\"Alternate name for this record\" class=\"altValue\">[Pauline Rosenbaum]</span> </span> </td> </tr> <tr> <th>Age in 1910:</th> <td> 51 </td> </tr> <tr> <th>Birth Date:</th> <td> <span class=\"srchHit\">1859 <span title=\"Alternate date for this record\" class=\"altValue\">[1859]</span> </span> </td> </tr> <tr> <th>Birthplace:</th> <td> Austria </td> </tr> <tr> <th>Home in 1910:</th> <td> Bronx Assembly District 34, New York, New York, USA </td> </tr> <tr> <th>Street:</th> <td> est Clenton Ave </td> </tr> <tr> <th>Race:</th> <td> White </td> </tr> <tr> <th>Gender:</th> <td> Female </td> </tr> <tr> <th>Immigration Year:</th> <td> 1887 </td> </tr> <tr> <th>Relation to Head of House:</th> <td> Mother-in-law </td> </tr> <tr> <th>Marital Status:</th> <td> Widowed </td> </tr> <tr> <th>Father's Birthplace:</th> <td> Austria </td> </tr> <tr> <th>Mother's Birthplace:</th> <td> Austria </td> </tr> <tr> <th>Native Tongue:</th> <td> English </td> </tr> <tr> <th>Attended School:</th> <td> No </td> </tr> <tr> <th>Able to read:</th> <td> Yes </td> </tr> <tr> <th>Able to Write:</th> <td> Yes </td> </tr> <tr> <th>Number of Children Born:</th> <td> 7 </td> </tr> <tr> <th>Number of Children Living:</th> <td> 5 </td> </tr> <tr> <th>Neighbors:</th> <td> <button type=\"button\" title=\"View others on page\" class=\"link neighborsLink\" data-modal-title=\"View others on page\" data-image-gid=\"4450082_00484:7884\" data-tracking-event=\"content : neighbors link clicked\">View others on page</button> </td> </tr> </tbody> </table>" | ||
} | ||
], | ||
"seeAlso": [] | ||
} | ||
] | ||
} | ||
} | ||
] | ||
/** END TEST CASES **/ |