Skip to content

Commit

Permalink
De-noise search result highlighting
Browse files Browse the repository at this point in the history
The logic was changed a while back to use a prefix
match in JavaScript, which made it extremely noisy
for short prefixes (e.g. “major h. b. ferguson”
would match every word starting with h or b).

This changes the logic to convert both terms to
lower-case, trim punctuation and leading/trailing
whitespace, and then test for exact matches, so
e.g. “h.” would match “H.”, “H”, “h!”, etc. but
not “How” or “oh”.

20a8cd8

3b18aa0
  • Loading branch information
acdha committed Feb 26, 2019
1 parent d341a39 commit ce5e9c3
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 10 deletions.
22 changes: 15 additions & 7 deletions core/static/js/page.js
Original file line number Diff line number Diff line change
Expand Up @@ -157,18 +157,26 @@
var params = $.deparam.fragment();
var words = params["words"] || "";

// FIXME: refactor this and search_pages_results.html to share a common implementation
var highlightNoiseRegEx = new RegExp(
/^[/.,/#!$%^&*;:{}=\-_`~()]+|[/.,/#!$%^&*;:{}=\-_`~()]+$|'s$/
);

$.getJSON(coordinates_url, function(all_coordinates) {
var scale = 1 / all_coordinates["width"];

$.each(words.split(" "), function(index, word) {
if (word != "") {
if (word) {
word = word.toLocaleLowerCase().trim();

for (var word_on_page in all_coordinates["coords"]) {
//check if the word on the page starts or ends with the word we are looking for
if (
word_on_page
.toLowerCase()
.indexOf(word.toLowerCase()) > -1
) {
var match_word = word_on_page
.toLocaleLowerCase()
.replace(highlightNoiseRegEx, " ")
.replace(/\s+/, " ")
.trim();

if (match_word === word) {
var coordinates =
all_coordinates["coords"][word_on_page];
if (coordinates !== undefined) {
Expand Down
14 changes: 11 additions & 3 deletions loc/templates/search_pages_results.html
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
that shifts the thumbnail down and to the right a couple pixels... so
we do the same with the highlights */

var highlightNoiseRegEx = new RegExp(/^[/.,/#!$%^&*;:{}=\-_`~()]+|[/.,/#!$%^&*;:{}=\-_`~()]+$|'s$/);

function add_highlights(div) {
var image = div.find(".thumbnail");
Expand All @@ -46,11 +47,18 @@
var vScale = height / all_coordinates["height"];
var hScale = width / all_coordinates["width"];
$.each(words.split(" "), function(index, word) {
for (word_on_page in all_coordinates["coords"]){
//check if the word on the page starts or ends with the word we are looking for
if(word_on_page.toLowerCase().indexOf(word.toLowerCase()) > -1 ){
if (word) {
// don't do anything if the word is blank
word = word.toLocaleLowerCase().trim();

for (word_on_page in all_coordinates["coords"]) {
match_word = word_on_page
.toLocaleLowerCase()
.replace(highlightNoiseRegEx, " ")
.replace(/\s+/, " ")
.trim();

if (match_word === word) {
var coordinates = all_coordinates["coords"][word_on_page];
for (k in coordinates) {
var v = coordinates[k];
Expand Down

0 comments on commit ce5e9c3

Please sign in to comment.