Skip to content

Commit

Permalink
Rewrite Ancestry.com Federal Census for current site (#2582)
Browse files Browse the repository at this point in the history
  • Loading branch information
AbeJellinek authored Jul 26, 2021
1 parent edfee55 commit 59facf0
Showing 1 changed file with 137 additions and 210 deletions.
347 changes: 137 additions & 210 deletions Ancestry.com US Federal Census.js
Original file line number Diff line number Diff line change
@@ -1,237 +1,164 @@
{
"translatorID": "0dda3f89-15de-4479-987f-cc13f1ba7999",
"label": "Ancestry.com US Federal Census",
"creator": "Elena Razlogova",
"target": "^https?://search\\.ancestry\\.com/.*(usfedcen|1890orgcen|1910uscenindex)",
"minVersion": "1.0.0b4.r1",
"creator": "Abe Jellinek",
"target": "^https?://(www\\.)?ancestry\\.com/",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "g",
"lastUpdated": "2015-06-02 10:57:09"
"browserSupport": "gcibv",
"lastUpdated": "2021-07-22 19:20:28"
}

function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;

var result = doc.evaluate('//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]', doc, nsResolver,
XPathResult.ANY_TYPE, null).iterateNext();
/*
***** BEGIN LICENSE BLOCK *****
var rows = doc.evaluate('//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrow record"]',
doc, nsResolver, XPathResult.ANY_TYPE, null);
var row;
while (row = rows.iterateNext()) {
links = doc.evaluate('.//a', row, nsResolver, XPathResult.ANY_TYPE, null);
var linkNo=0;
while (link=links.iterateNext()) {
linkNo=linkNo+1;
}
break;
}
Copyright © 2021 Abe Jellinek
if (result && linkNo == 2) {
return "multiple";
} else {
var indivRe = /indiv=1/;
var m = indivRe.exec(doc.location.href);
var indiv = 0;
if (m) {
indiv = 1;
}
This file is part of Zotero.
checkURL = doc.location.href.replace("pf=", "");
if (doc.location.href == checkURL && indiv == 1) {
return "bookSection";
}
}
}
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
// this US Federal Census scraper is a hack - so far there is no proper item type in Zotero for this kind of data (added to trac as a low priority ticket)
// this scraper creates proper citation for the census as a whole (should be cited as book)
// but also adds name, city, and state for a particular individual to the citation to make scanning for names & places easier in the middle pane
// (that's why the resulting item type is a book section)
// it also adds all searchable text as a snapshot and a scan of the census record as an image
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
function scrape(doc) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;

// get initial census data; a proper census record item type should have separate fields for all of these except perhaps dbid
var info = doc.evaluate('//div[@class="facets"][@id="connect"]/div[@class="g_box"]/p/a',
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();

if (info) {

info = info.toString();
var data = new Array();
var parts = info.split(/[?&]/);
for (var i=0; i<parts.length; i++) {
var part = parts[i];
var index = part.indexOf("=");
if (index !== -1) {
data[part.substr(0, index)] = part.substr(index+1);
}
}

if (data.ln) {
var lastName = data.ln.replace(/\+/g, " ");
var firstName = data.fn.replace(/\+/g, " ");
} else {
var lastName = data.fn.replace(/\+/g, " ");
var firstName = "";
}
var dOb = data.by; // this does not get saved yet because no field is available; the info is in the snapshot
if (data.rfd) {
var yearRe = /([0-9]{4})/;
var m = yearRe.exec(data.rfd);
if (m) {
var year = m[1];
}
} else { var year = data.ry; }
var state = data.rs.replace(/\+/g, " ");
var county = data.rcnty.replace(/\+/g, " "); // this does not get saved yet because no field is available; the info is in the snapshot
var city = data.rcty.replace(/\+/g, " ");
var dbid = data.dbid;
}

// set census number for citation - let me know if this can be done in a better way
var censusYear = 0;
var censusNo = "";
var censusNos = new Array("1790", "First", "1800", "Second", "1810", "Third", "1820", "Fourth", "1830", "Fifth", "1840", "Sixth", "1850", "Seventh", "1860", "Eighth", "1870", "Ninth",
"1880", "Tenth", "1890", "Eleventh", "1900", "Twelfth", "1910", "Thirteenth", "1920", "Fourteenth", "1930", "Fifteenth")
for (var i in censusNos) {
if (censusYear == 1) { censusNo = censusNos[i] };
if (censusNos[i] == year) { censusYear = 1 } else {censusYear= 0 };
}
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
//begin adding item
var newItem = new Zotero.Item("bookSection");
newItem.title = city+", "+state; // this is not proper citation but is needed to easily scan for placenames in middle pane
newItem.publicationTitle = censusNo+" Census of the United States, "+year;
newItem.publisher = "National Archives and Records Administration";
newItem.place = "Washington, DC";
newItem.date = year;

// get snapshot with all searchable text and a simplified link to the record for the URL field
var dbRe = /db=([0-9a-z]+)/;
var m = dbRe.exec(doc.location.href);
if (m) {
db = m[1];
}
var snapshotRe = /\&h=([0-9]+)/;
var m = snapshotRe.exec(doc.location.href);
if (m) {
snapshotURL = "http://search.ancestry.com/cgi-bin/sse.dll?db="+db+"&indiv=1&pf=1&h="+m[1];
newItem.attachments.push({title:"Ancestry.com Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true});
cleanURL = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+db+"&fh=0&h="+m[1];
newItem.url = cleanURL;
}

// add particular individual being surveyed as contributor - this is not proper citation but is needed so one could easily scan for names in middle pane
var creator = new Array();
creator.firstName = firstName;
creator.lastName = lastName;
creator.creatorType = "author";
newItem.creators.push(creator);

//add proper author for citation
var creator = new Array();
creator.lastName = "United States of America, Bureau of the Census";
creator.creatorType = "contributor";
newItem.creators.push(creator);
***** END LICENSE BLOCK *****
*/

// get scan of the census image
var scanInfo = doc.evaluate('//div[@id="record-main"]/table[@class="p_recTable"]/tbody/tr/td[2][@class="recordTN"]/a',
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();

if (scanInfo) {
var scanRe = /iid=([A-Z0-9_-]+)/;
var m = scanRe.exec(scanInfo);
if (m) {
scanURL = "http://content.ancestry.com/Browse/print_u.aspx?dbid="+dbid+"&iid="+m[1];
Zotero.debug("scan url: " + scanURL);
}
}

if (scanURL){
Zotero.Utilities.HTTP.doGet(scanURL, function(text) {
Zotero.debug("running doGet");
Zotero.debug(text);
var imageRe = /950 src="([^"]+)"/;
var m = imageRe.exec(text);
if (m) {
imageURL = m[1];
Zotero.debug("image url: " + imageURL);
newItem.attachments.push({title:"Ancestry.com Image", mimeType:"image/jpeg", url:imageURL, snapshot:true});
}

newItem.complete();
Zotero.done();
});
} else {
newItem.complete();
Zotero.done();

/**
 * Decide whether the current page is a record this translator can save.
 *
 * The text of the collection link inside the page title
 * ('.pageTitle .pageIntro a') is checked for the phrase
 * 'United States Federal Census'.
 *
 * @param {Document} doc - page being inspected
 * @param {string} url - page address (not consulted here)
 * @returns {string|boolean} "bookSection" when the phrase is present,
 *     otherwise false
 */
function detectWeb(doc, url) {
	// `text` is not defined in this file; presumably the selector helper
	// supplied by the Zotero translator environment - confirm.
	const collectionName = text(doc, '.pageTitle .pageIntro a');
	return collectionName.includes('United States Federal Census')
		? "bookSection"
		: false;
}

/**
 * Translator entry point: save the record shown on the current page.
 *
 * detectWeb only ever reports a single "bookSection", so there is no
 * multiple-item selection step; the work is handed straight to scrape().
 *
 * @param {Document} doc - page being saved
 * @param {string} url - address of that page
 */
function doWeb(doc, url) {
	scrape(doc, url);
}

/**
 * Build a bookSection item from a census record page and save it.
 *
 * Metadata comes from the page title and the <em> elements of the
 * '.sourceText' citation. The record table (minus its household-members
 * row) is stored as a note. The record image is attached on a best-effort
 * basis: a download URL is requested from the image-viewer token endpoint,
 * and the item is completed whether or not that succeeds.
 *
 * @param {Document} doc - record page
 * @param {string} url - address of the record page; query string and
 *     fragment are stripped before it is stored on the item
 */
function scrape(doc, url) {
	let item = new Zotero.Item('bookSection');

	item.title = text(doc, '.pageTitle span');
	// The citation's <em> elements are read positionally as
	// date, place, roll, page.
	let [date, place, roll, page] = doc.querySelectorAll('.sourceText em'); // not ideal
	item.bookTitle = text(doc, '.pageTitle .pageIntro a').trim()
		+ ` [${place.textContent}]`;
	item.publisher = 'National Archives and Records Administration';
	// technically the Census is published 72 years after it's taken, but citing
	// that way doesn't seem to be the convention.
	item.date = date.textContent;
	item.pages = `${page.textContent} (roll ${roll.textContent})`;
	item.archive = 'Ancestry.com';
	item.url = url.replace(/[?#].*/, '');

	let recordTable = doc.querySelector('#recordServiceData');
	if (recordTable) {
		// Work on a copy so the live page is not modified.
		recordTable = recordTable.cloneNode(true);

		let familyMembers = recordTable.querySelector('.tableContainerRow');
		if (familyMembers) familyMembers.remove();

		item.notes.push({
			note: ZU.trimInternal(recordTable.outerHTML)
		});
	}

	let imageSrc = attr(doc, '.photo.clickable img', 'src');
	let dbIdMatch = imageSrc.match(/\/namespaces\/([^/]+)/);
	let imageIdMatch = imageSrc.match(/([^/]+)\.jpg/);
	if (!dbIdMatch || !imageIdMatch) {
		// No recognizable record image on the page: save the metadata
		// without an attachment instead of throwing and losing the item.
		item.complete();
		return;
	}

	let dbId = dbIdMatch[1];
	let imageId = imageIdMatch[1];
	ZU.doGet(
		`/imageviewer/api/media/token?dbId=${dbId}&imageId=${imageId}`,
		function (respText) {
			try {
				let json = JSON.parse(respText);
				if (json.imageDownloadUrl) {
					item.attachments.push({
						title: 'Census Record',
						mimeType: 'image/jpeg',
						url: json.imageDownloadUrl
					});
				}
			}
			catch (_) {
				// The token response is fragile; an unparseable reply just
				// means the item is saved without the image.
			}
			// Completed exactly once, outside the try, so an error raised by
			// complete() itself cannot trigger a second call.
			item.complete();
		}
	);
}

/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "https://www.ancestry.com/discoveryui-content/view/131479739:2442?tid=&pid=&queryId=2a5ea51171527460c8a3755eb4b3fc1e&_phsrc=BYN5&_phstart=successSource",
"items": [
{
"itemType": "bookSection",
"title": "Albert Einstein",
"creators": [],
"date": "1940",
"archive": "Ancestry.com",
"bookTitle": "1940 United States Federal Census [Princeton, Mercer, New Jersey]",
"libraryCatalog": "Ancestry.com US Federal Census",
"pages": "10B (roll m-t0627-02357)",
"publisher": "National Archives and Records Administration",
"url": "https://www.ancestry.com/discoveryui-content/view/131479739:2442",
"attachments": [
{
"title": "Census Record",
"mimeType": "image/jpeg"
}
],
"tags": [],
"notes": [
{
"note": "<table id=\"recordServiceData\" class=\"table tableHorizontal tableHorizontalRuled\"> <tbody> <tr> <th>Name:</th> <td> Albert Einstein </td> </tr> <tr> <th>Respondent:</th> <td> Yes </td> </tr> <tr> <th>Age:</th> <td> 61 </td> </tr> <tr> <th>Estimated Birth Year:</th> <td> <span class=\"srchHit\"> <span title=\"Alternate values for this record\" class=\"altValue\">[abt 1879]</span> </span> <span title=\"This value was member submitted. Click to see details.\" class=\"altValue\"> [<button class=\"link correction\" data-tracking-event=\"content : correction clicked\">14 Mar 1879</button>] </span> </td> </tr> <tr> <th>Gender:</th> <td> Male </td> </tr> <tr> <th>Race:</th> <td> White </td> </tr> <tr> <th>Birthplace:</th> <td> Germany </td> </tr> <tr> <th>Marital Status:</th> <td> Widowed </td> </tr> <tr> <th>Relation to Head of House:</th> <td> Head </td> </tr> <tr> <th>Home in 1940:</th> <td> Princeton, Mercer, New Jersey </td> </tr> <tr> <th>Map of Home in 1940:</th> <td> <button type=\"button\" title=\"View map\" class=\"link mapLink\" data-modal-title=\"Princeton, Mercer, New Jersey\" data-place-names=\"Princeton,Mercer,New Jersey\" data-tracking-event=\"content : map link clicked\">Princeton, Mercer, New Jersey</button> </td> </tr> <tr> <th>Street:</th> <td> Mercer - Street </td> </tr> <tr> <th>House Number:</th> <td> 112 </td> </tr> <tr> <th>Farm:</th> <td> No </td> </tr> <tr> <th>Inferred Residence in 1935:</th> <td> Princeton, Mercer, New Jersey </td> </tr> <tr> <th>Residence in 1935:</th> <td> Princeton </td> </tr> <tr> <th>Resident on farm in 1935:</th> <td> No </td> </tr> <tr> <th>Citizenship:</th> <td> Having first papers </td> </tr> <tr> <th>Sheet Number:</th> <td> 10B </td> </tr> <tr> <th>Number of Household in Order of Visitation:</th> <td> 267 </td> </tr> <tr> <th>Occupation:</th> <td> Pychies Professor </td> </tr> <tr> <th>Industry:</th> <td> Private School </td> </tr> <tr> <th>House Owned or Rented:</th> <td> Owned </td> </tr> <tr> <th>Value 
of Home or Monthly Rental if Rented:</th> <td> 22000 </td> </tr> <tr> <th>Attended School or College:</th> <td> No </td> </tr> <tr> <th>Highest Grade Completed:</th> <td> College, 5th or subsequent year </td> </tr> <tr> <th>Hours Worked Week Prior to Census:</th> <td> 44 </td> </tr> <tr> <th>Class of Worker:</th> <td> Wage or salary worker in private work </td> </tr> <tr> <th>Weeks Worked in 1939:</th> <td> 52 </td> </tr> <tr> <th>Income:</th> <td> 5000 </td> </tr> <tr> <th>Income Other Sources:</th> <td> Yes </td> </tr> <tr> <th>Neighbors:</th> <td> <button type=\"button\" title=\"View others on page\" class=\"link neighborsLink\" data-modal-title=\"View others on page\" data-image-gid=\"m-t0627-02357-00675:2442\" data-tracking-event=\"content : neighbors link clicked\">View others on page</button> </td> </tr> </tbody> </table>"
}
],
"seeAlso": []
}
]
},
{
"type": "web",
"url": "https://www.ancestry.com/discoveryui-content/view/18443183:7884?tid=&pid=&queryId=283135001368664572d798e1a9012c06&_phsrc=oJW436&_phstart=successSource",
"items": [
{
"itemType": "bookSection",
"title": "Pauline Rosenboom",
"creators": [],
"date": "1910",
"archive": "Ancestry.com",
"bookTitle": "1910 United States Federal Census [Bronx Assembly District 34, New York, New York]",
"libraryCatalog": "Ancestry.com US Federal Census",
"pages": "4A (roll T624_1001)",
"publisher": "National Archives and Records Administration",
"url": "https://www.ancestry.com/discoveryui-content/view/18443183:7884",
"attachments": [
{
"title": "Census Record",
"mimeType": "image/jpeg"
}
],
"tags": [],
"notes": [
{
"note": "<table id=\"recordServiceData\" class=\"table tableHorizontal tableHorizontalRuled\"> <tbody> <tr> <th>Name:</th> <td> <span class=\"srchHit\">Pauline Rosenboom <span title=\"Alternate name for this record\" class=\"altValue\">[Pauline Rosenbaum]</span> </span> </td> </tr> <tr> <th>Age in 1910:</th> <td> 51 </td> </tr> <tr> <th>Birth Date:</th> <td> <span class=\"srchHit\">1859 <span title=\"Alternate date for this record\" class=\"altValue\">[1859]</span> </span> </td> </tr> <tr> <th>Birthplace:</th> <td> Austria </td> </tr> <tr> <th>Home in 1910:</th> <td> Bronx Assembly District 34, New York, New York, USA </td> </tr> <tr> <th>Street:</th> <td> est Clenton Ave </td> </tr> <tr> <th>Race:</th> <td> White </td> </tr> <tr> <th>Gender:</th> <td> Female </td> </tr> <tr> <th>Immigration Year:</th> <td> 1887 </td> </tr> <tr> <th>Relation to Head of House:</th> <td> Mother-in-law </td> </tr> <tr> <th>Marital Status:</th> <td> Widowed </td> </tr> <tr> <th>Father's Birthplace:</th> <td> Austria </td> </tr> <tr> <th>Mother's Birthplace:</th> <td> Austria </td> </tr> <tr> <th>Native Tongue:</th> <td> English </td> </tr> <tr> <th>Attended School:</th> <td> No </td> </tr> <tr> <th>Able to read:</th> <td> Yes </td> </tr> <tr> <th>Able to Write:</th> <td> Yes </td> </tr> <tr> <th>Number of Children Born:</th> <td> 7 </td> </tr> <tr> <th>Number of Children Living:</th> <td> 5 </td> </tr> <tr> <th>Neighbors:</th> <td> <button type=\"button\" title=\"View others on page\" class=\"link neighborsLink\" data-modal-title=\"View others on page\" data-image-gid=\"4450082_00484:7884\" data-tracking-event=\"content : neighbors link clicked\">View others on page</button> </td> </tr> </tbody> </table>"
}
],
"seeAlso": []
}
]
}
}
]
/** END TEST CASES **/

0 comments on commit 59facf0

Please sign in to comment.