Skip to content

Commit

Permalink
Embedded metadata - deal with quirky handling of URLs and item types.…
Browse files Browse the repository at this point in the history
… Boston Globe: fix authors. Rest - test updates
  • Loading branch information
adam3smith committed May 14, 2012
1 parent 995b79a commit 862b215
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 136 deletions.
9 changes: 2 additions & 7 deletions Annual Reviews.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"priority": 200,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2012-04-10 00:10:21"
"browserSupport": "gcsib",
"lastUpdated": "2012-05-13 22:55:57"
}

/**
Expand Down Expand Up @@ -140,11 +140,6 @@ var testCases = [
"url": "http://www.annualreviews.org/journal/biophys",
"items": "multiple"
},
{
"type": "web",
"url": "http://www.annualreviews.org/toc/biophys/forthcoming",
"items": "multiple"
},
{
"type": "web",
"url": "http://www.annualreviews.org/toc/biophys/40/1",
Expand Down
6 changes: 4 additions & 2 deletions Embedded Metadata.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"priority": 400,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsbv",
"lastUpdated": "2012-05-08 12:52:05"
"browserSupport": "gcsb",
"lastUpdated": "2012-05-13 23:44:30"
}

/*
Expand Down Expand Up @@ -494,6 +494,8 @@ function doWeb(doc, url) {
translator.setTranslator("5e3ad958-ac79-463d-812b-a86a9235c28f");
// Per-item callback for the delegated translator: note that an item was
// produced, apply the local overrides, then pass the item on for completion.
translator.setHandler("itemDone", function(translate, item) {
	_haveItem = true;

	// A truthy _itemType takes precedence over the type chosen by the delegate.
	if (_itemType) {
		item.itemType = _itemType;
	}

	// Always record the address of the document being translated.
	item.url = doc.location.href;

	completeItem(doc, item);
});

Expand Down
110 changes: 2 additions & 108 deletions Galegroup.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcs",
"lastUpdated": "2012-05-10 13:23:58"
"lastUpdated": "2012-05-13 23:17:56"
}

/*
Expand Down Expand Up @@ -183,112 +183,6 @@ var testCases = [
}
]
},
{
"type": "web",
"url": "http://go.galegroup.com/ps/i.do?action=interpret&id=GALE%7CA221274499&v=2.1&u=viva_gmu&it=r&p=AONE&sw=w&authCount=1",
"items": [
{
"itemType": "journalArticle",
"creators": [
{
"lastName": "Aaronson",
"firstName": "Daniel",
"creatorType": "author"
},
{
"lastName": "Meckel",
"firstName": "Katherine",
"creatorType": "author"
}
],
"notes": [],
"tags": [
"Baby boom generation",
"Labor market",
"Labour market"
],
"seeAlso": [],
"attachments": [
{
"mimeType": "text/html",
"title": "Full Text (HTML)",
"downloadable": true
}
],
"publicationTitle": "Economic Perspectives",
"url": "http://go.galegroup.com/ps/i.do?id=GALE%7CA221274499&v=2.1&u=viva_gmu&it=r&p=AONE&sw=w",
"issue": "4",
"extra": "2",
"DOI": "Article",
"date": "2009",
"ISBN": "1048115X",
"ISSN": "1048115X",
"pages": "2+",
"title": "How will baby boomer retirements affect teacher labor markets?",
"volume": "33",
"accessDate": "May 7, 2012",
"language": "English",
"libraryCatalog": "Galegroup"
}
]
},
{
"type": "web",
"url": "http://go.galegroup.com/ps/i.do?action=interpret&id=GALE%7CA286390464&v=2.1&u=viva_gmu&it=r&p=EAIM&sw=w&authCount=1",
"items": [
{
"itemType": "journalArticle",
"creators": [
{
"lastName": "Aizen",
"firstName": "Marcelo A.",
"creatorType": "author"
},
{
"lastName": "Sabatino",
"firstName": "Matena",
"creatorType": "author"
},
{
"lastName": "Tylianakis",
"firstName": "Jason M.",
"creatorType": "author"
}
],
"notes": [
{
"note": "<p>American Association for the Advancement of Science. Due to publisher request, Science cannot be reproduced until 360 days after the original publication date.</p>"
}
],
"tags": [
"Ecological restoration",
"Extinction (Biology)"
],
"seeAlso": [],
"attachments": [
{
"mimeType": "text/html",
"title": "Full Text (HTML)",
"downloadable": true
}
],
"publicationTitle": "Science",
"url": "http://go.galegroup.com/ps/i.do?id=GALE%7CA286390464&v=2.1&u=viva_gmu&it=r&p=EAIM&sw=w",
"issue": "6075",
"extra": "1486",
"DOI": "Report",
"date": "2012",
"ISBN": "00368075",
"ISSN": "00368075",
"pages": "1486+",
"title": "Specialization and rarity predict nonrandom loss of interactions from mutualist networks",
"volume": "335",
"accessDate": "May 7, 2012",
"language": "English",
"libraryCatalog": "Galegroup"
}
]
},
{
"type": "web",
"url": "http://find.galegroup.com/ecco/infomark.do?&source=gale&prodId=ECCO&u=viva_gmu&tabID=T001&docId=CW3325179878&type=multipage&contentSet=ECCOArticles&version=1.0&docLevel=FASCIMILE",
Expand All @@ -308,7 +202,7 @@ var testCases = [
],
"title": "A digest of the law of actions and trials at nisi prius. By Isaac 'espinasse, of Gray's Inn, Esq. Barrister at Law. The third edition, corrected, with considerable additions from printed and manuscript cases. In two volumes. ...",
"place": "London",
"url": "http://find.galegroup.com/ecco/infomark.do?&source=gale&prodId=ECCO&u=viva_gmu&tabID=T001&docId=CW3325179878&type=multipage&contentSet=ECCOArticles&version=1.0",
"url": "http://find.galegroup.com/ecco/infomark.do?&source=gale&prodId=ECCO&userGroupName=viva_gmu&tabID=T001&docId=CW3325179878&type=multipage&contentSet=ECCOArticles&version=1.0",
"pages": "469",
"numPages": "469",
"DOI": "Monograph",
Expand Down
5 changes: 3 additions & 2 deletions Kommersant.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcs",
"lastUpdated": "2012-05-08 09:21:42"
"lastUpdated": "2012-05-13 23:34:22"
}

/* FW LINE 57:6869c32952b1 */ function flatten(c){var b=new Array();for(var d in c){var e=c[d];if(e instanceof Array){b=b.concat(flatten(e))}else{b.push(e)}}return b}var FW={_scrapers:new Array()};FW._Base=function(){this.callHook=function(b,c,e,a){if(typeof this["hooks"]==="object"){var d=this["hooks"][b];if(typeof d==="function"){d(c,e,a)}}};this.evaluateThing=function(f,e,c){var b=typeof f;if(b==="object"){if(f instanceof Array){var d=this.evaluateThing;var a=f.map(function(g){return d(g,e,c)});return flatten(a)}else{return f.evaluate(e,c)}}else{if(b==="function"){return f(e,c)}else{return f}}}};FW.Scraper=function(a){FW._scrapers.push(new FW._Scraper(a))};FW._Scraper=function(a){for(x in a){this[x]=a[x]}this._singleFieldNames=["abstractNote","applicationNumber","archive","archiveLocation","artworkMedium","artworkSize","assignee","audioFileType","audioRecordingType","billNumber","blogTitle","bookTitle","callNumber","caseName","code","codeNumber","codePages","codeVolume","committee","company","conferenceName","country","court","date","dateDecided","dateEnacted","dictionaryTitle","distributor","docketNumber","documentNumber","DOI","edition","encyclopediaTitle","episodeNumber","extra","filingDate","firstPage","forumTitle","genre","history","institution","interviewMedium","ISBN","ISSN","issue","issueDate","issuingAuthority","journalAbbreviation","label","language","legalStatus","legislativeBody","letterType","libraryCatalog","manuscriptType","mapType","medium","meetingName","nameOfAct","network","number","numberOfVolumes","numPages","pages","patentNumber","place","postType","presentationType","priorityNumbers","proceedingsTitle","programTitle","programmingLanguage","publicLawNumber","publicationTitle","publisher","references","reportNumber","reportType","reporter","reporterVolume","rights","runningTime","scale","section","series","seriesNumber","seriesText","seriesTitle","session","shortTitle","studio","subject","system","thesisType","title","type","university","url"
,"version","videoRecordingType","volume","websiteTitle","websiteType"];this._makeAttachments=function(p,b,g,t){if(g instanceof Array){g.forEach(function(k){this._makeAttachments(p,b,k,t)},this)}else{if(typeof g==="object"){var o=g.urls||g.url;var m=g.types||g.type;var f=g.titles||g.title;var q=g.snapshots||g.snapshot;var j=this.evaluateThing(o,p,b);var n=this.evaluateThing(f,p,b);var s=this.evaluateThing(m,p,b);var d=this.evaluateThing(q,p,b);if(!(j instanceof Array)){j=[j]}for(var l in j){var c=j[l];var h;var e;var r;if(s instanceof Array){h=s[l]}else{h=s}if(n instanceof Array){e=n[l]}else{e=n}if(d instanceof Array){r=d[l]}else{r=d}t.attachments.push({url:c,title:e,type:h,snapshot:r})}}}};if(this.itemTrans!==undefined){this.makeItems=this.itemTrans.makeItems}else{this.makeItems=function(o,b,m,c,l){var q=new Zotero.Item(this.itemType);q.url=b;for(var h in this._singleFieldNames){var n=this._singleFieldNames[h];if(this[n]){var g=this.evaluateThing(this[n],o,b);if(g instanceof Array){q[n]=g[0]}else{q[n]=g}}}var r=["creators","tags"];for(var f in r){var p=r[f];var d=this.evaluateThing(this[p],o,b);if(d){for(var e in d){q[p].push(d[e])}}}this._makeAttachments(o,b,this["attachments"],q);c(q,this,o,b);l([q])}}};FW._Scraper.prototype=new FW._Base;FW.MultiScraper=function(a){FW._scrapers.push(new FW._MultiScraper(a))};FW._MultiScraper=function(a){for(x in a){this[x]=a[x]}this._mkSelectItems=function(e,d){var b=new Object;for(var c in e){b[d[c]]=e[c]}return b};this._selectItems=function(d,c,e){var b=new Array();Zotero.selectItems(this._mkSelectItems(d,c),function(f){for(var g in f){b.push(g)}e(b)})};this._mkAttachments=function(g,d,f){var b=this.evaluateThing(this["attachments"],g,d);var c=new Object();if(b){for(var e in f){c[f[e]]=b[e]}}return c};this._makeChoices=function(f,p,c,d,h){if(f instanceof Array){f.forEach(function(k){this._makeTitlesUrls(k,p,c,d,h)},this)}else{if(typeof f==="object"){var m=f.urls||f.url;var e=f.titles||f.title;var n=this.evaluateThing(m,p,c);var 
j=this.evaluateThing(e,p,c);var l=(j instanceof Array);if(!(n instanceof Array)){n=[n]}for(var g in n){var b=n[g];var o;if(l){o=j[g]}else{o=j}h.push(b);d.push(o)}}}};this.makeItems=function(j,b,g,c,f){if(this.beforeFilter){var k=this.beforeFilter(j,b);if(k!=b){this.makeItems(j,k,g,c,f);return}}var e=[];var h=[];this._makeChoices(this["choices"],j,b,e,h);var d=this._mkAttachments(j,b,h);this._selectItems(e,h,function(m){if(!m){f([])}else{var l=[];var n=this.itemTrans;Zotero.Utilities.processDocuments(m,function(q){var p=q.documentURI;var o=n;if(o===undefined){o=FW.getScraper(q,p)}if(o===undefined){}else{o.makeItems(q,p,d[p],function(r){l.push(r);c(r,o,q,p)},function(){})}},function(){f(l)})}})}};FW._MultiScraper.prototype=new FW._Base;FW.DelegateTranslator=function(a){return new FW._DelegateTranslator(a)};FW._DelegateTranslator=function(a){for(x in a){this[x]=a[x]}this._translator=Zotero.loadTranslator(this.translatorType);this._translator.setTranslator(this.translatorId);this.makeItems=function(g,d,b,f,c){var e;Zotero.Utilities.HTTP.doGet(d,function(h){this._translator.setHandler("itemDone",function(k,j){e=j;if(b){j.attachments=b}});if(this.preProcess){h=this.preProcess(h)}this._translator.setString(h);this._translator.translate();f(e)},function(){c([e])})}};FW.DelegateTranslator.prototype=new FW._Scraper;FW._StringMagic=function(){this._filters=new Array();this.addFilter=function(a){this._filters.push(a);return this};this.split=function(a){return this.addFilter(function(b){return b.split(a).filter(function(c){return(c!="")})})};this.replace=function(c,b,a){return this.addFilter(function(d){if(d.match(c)){return d.replace(c,b,a)}else{return d}})};this.prepend=function(a){return this.replace(/^/,a)};this.append=function(a){return this.replace(/$/,a)};this.remove=function(b,a){return this.replace(b,"",a)};this.trim=function(){return this.addFilter(function(a){return Zotero.Utilities.trim(a)})};this.trimInternal=function(){return this.addFilter(function(a){return 
Zotero.Utilities.trimInternal(a)})};this.match=function(a,b){if(!b){b=0}return this.addFilter(function(d){var c=d.match(a);if(c===undefined||c===null){return undefined}else{return c[b]}})};this.cleanAuthor=function(b,a){return this.addFilter(function(c){return Zotero.Utilities.cleanAuthor(c,b,a)})};this.key=function(a){return this.addFilter(function(b){return b[a]})};this.capitalizeTitle=function(){if(arguments.length>0&&arguments[0]==true){return this.addFilter(function(a){return Zotero.Utilities.capitalizeTitle(a,true)})}else{return this.addFilter(function(a){return Zotero.Utilities.capitalizeTitle(a)})}};this.unescapeHTML=function(){return this.addFilter(function(a){return Zotero.Utilities.unescapeHTML(a)})};this.unescape=function(){return this.addFilter(function(a){return unescape(a)})};this._applyFilters=function(c,e){for(i in this._filters){c=flatten(c);c=c.filter(function(a){return((a!==undefined)&&(a!==null))});for(var d=0;d<c.length;d++){try{if((c[d]===undefined)||(c[d]===null)){continue}else{c[d]=this._filters[i](c[d],e)}}catch(b){c[d]=undefined;Zotero.debug("Caught exception "+b+"on filter: "+this._filters[i])}}c=c.filter(function(a){return((a!==undefined)&&(a!==null))})}return flatten(c)}};FW.PageText=function(){return new FW._PageText()};FW._PageText=function(){this._filters=new Array();this.evaluate=function(c){var b=[c.documentElement.innerHTML];b=this._applyFilters(b,c);if(b.length==0){return false}else{return b}}};FW._PageText.prototype=new FW._StringMagic();FW.Url=function(){return new FW._Url()};FW._Url=function(){this._filters=new Array();this.evaluate=function(d,c){var b=[c];b=this._applyFilters(b,d);if(b.length==0){return false}else{return b}}};FW._Url.prototype=new FW._StringMagic();FW.Xpath=function(a){return new FW._Xpath(a)};FW._Xpath=function(a){this._xpath=a;this._filters=new Array();this.text=function(){var b=function(c){if(typeof c==="object"&&c.textContent){return c.textContent}else{return c}};this.addFilter(b);return 
this};this.sub=function(b){var c=function(f,e){var d=e.evaluate(b,f,null,XPathResult.ANY_TYPE,null);if(d){return d.iterateNext()}else{return undefined}};this.addFilter(c);return this};this.evaluate=function(f){var e=f.evaluate(this._xpath,f,null,XPathResult.ANY_TYPE,null);var d=e.resultType;var c=new Array();if(d==XPathResult.STRING_TYPE){c.push(e.stringValue)}else{if(d==XPathResult.ORDERED_NODE_ITERATOR_TYPE||d==XPathResult.UNORDERED_NODE_ITERATOR_TYPE){var b;while((b=e.iterateNext())){c.push(b)}}}c=this._applyFilters(c,f);if(c.length==0){return false}else{return c}}};FW._Xpath.prototype=new FW._StringMagic();FW.detectWeb=function(e,b){for(var c in FW._scrapers){var d=FW._scrapers[c];var f=d.evaluateThing(d.itemType,e,b);var a=d.evaluateThing(d.detect,e,b);if(a.length>0&&a[0]){return f}}return undefined};FW.getScraper=function(b,a){var c=FW.detectWeb(b,a);return FW._scrapers.filter(function(d){return(d.evaluateThing(d.itemType,b,a)==c)&&(d.evaluateThing(d.detect,b,a))})[0]};FW.doWeb=function(c,a){var b=FW.getScraper(c,a);b.makeItems(c,a,[],function(f,e,g,d){e.callHook("scraperDone",f,g,d);if(!f.title){f.title=""}f.complete()},function(){Zotero.done()});Zotero.wait()};
Expand All @@ -24,6 +24,7 @@




/*
Kommersant Translator
Copyright (C) 2011 Avram Lyon, [email protected]
Expand Down Expand Up @@ -170,7 +171,7 @@ var testCases = [
"abstractNote": "За тем, как проходят российские выборы в месте, где административный ресурс по географическим причинам ослаблен, наблюдал корреспондент \"Власти\" Артем Платов.",
"date": "2011-12-12",
"issue": "49 (953)",
"publicationTitle": "Коммерсантъ",
"publicationTitle": "Коммерсантъ Власть",
"title": "Яблочный пуй",
"libraryCatalog": "Kommersant",
"accessDate": "CURRENT_TIMESTAMP"
Expand Down
35 changes: 18 additions & 17 deletions The Boston Globe.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
{
"translatorID": "1f245496-4c1b-406a-8641-d286b3888231",
"label": "The Boston Globe",
"creator": "Adam Crymble and Frank Bennett",
"creator": "Adam Crymble, Frank Bennett, Sebastian Karcher",
"target": "^http://(www|search|articles)\\.boston\\.com/",
"minVersion": "1.0.0b4.r5",
"minVersion": "2.1.9",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcs",
"lastUpdated": "2012-05-06 16:00:45"
"lastUpdated": "2012-05-14 00:01:12"
}

/*
Expand All @@ -27,14 +27,11 @@
*/

function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
}: null;


if (url.match("search.boston.com")) {
// Search disabled until cross-domain can be dealt with
return false;
var results = doc.evaluate('//div[@class="resultsMain"]//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
var results = doc.evaluate('//div[@class="resultsMain"]//div[@class="regTZ"]/a[@class="titleLink"]', doc, null, XPathResult.ANY_TYPE, null);
if (results.iterateNext()) {
return "multiple";
} else {
Expand Down Expand Up @@ -160,6 +157,7 @@ function scrape (doc, url) {
}

// Authors
/*
for (var i = 0, ilen = infoElem.childNodes.length; i < ilen; i += 1) {
var node = infoElem.childNodes.item(i);
if (node.nodeName === 'SPAN') {
Expand All @@ -173,6 +171,13 @@ function scrape (doc, url) {
}
}
}
}*/

// Authors: the byline is the text of the <span> element(s) that follow the
// "separator" span inside infoElem.
var authors = ZU.xpathText(infoElem, './span[@class="separator"]/following-sibling::span');
// Guard against pages without a byline: with no text to work on, calling
// .replace() on the lookup result would abort the whole scrape.
if (authors) {
	// Drop a leading "By"/"by" and everything from the first comma onwards.
	authors = authors.replace(/^\s*[Bb]y|,.+?$/g, "").trim();
	// Declared locally (it used to leak as an implicit global); the name is
	// kept because the rest of scrape() is not visible from this hunk.
	var author = authors.split(/ and |\s*,\s*/);
	// Index loop instead of for...in, so only array elements are visited.
	for (var i = 0; i < author.length; i++) {
		// Skip empty fragments so no blank creator is added to the item.
		if (author[i]) {
			newItem.creators.push(ZU.cleanAuthor(author[i], "author"));
		}
	}
}

// Title
Expand All @@ -189,15 +194,12 @@ function scrape (doc, url) {


function doWeb (doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
}: null;


var uris= new Array();

if (detectWeb(doc, url) == "multiple") {
var items = {};
var result = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
var result = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, null, XPathResult.ANY_TYPE, null);
var elmt = result.iterateNext();
while (elmt) {
//items.push(elmt.href);
Expand Down Expand Up @@ -256,7 +258,7 @@ var testCases = [
},
{
"type": "web",
"url": "http://www.boston.com/news/nation/articles/2012/05/06/2_ny_cooperators_give_firsthand_look_at_al_qaida/",
"url": "http://articles.boston.com/2012-05-06/news/31599524_1_qaida-khalid-sheik-mohammed-medunjanin",
"items": [
{
"itemType": "newspaperArticle",
Expand All @@ -283,9 +285,8 @@ var testCases = [
}
],
"publicationTitle": "Boston.com",
"url": "http://www.boston.com/news/nation/articles/2012/05/06/2_ny_cooperators_give_firsthand_look_at_al_qaida/",
"title": "2 NY cooperators give firsthand look at al-Qaida",
"date": "May 6, 2012",
"url": "http://articles.boston.com/2012-05-06/news/31599524_1_qaida-khalid-sheik-mohammed-medunjanin",
"title": "Cooperators give firsthand look at al-Qaida",
"libraryCatalog": "The Boston Globe",
"accessDate": "CURRENT_TIMESTAMP"
}
Expand Down

0 comments on commit 862b215

Please sign in to comment.