Skip to content

Commit

Permalink
Add edit distance matching for lookalike URLs using top 500 domains
Browse files Browse the repository at this point in the history
This is a follow-up to crrev/1379195. It uses the .cc file generated in the previous CL to check whether any top domain is within an edit distance of 1 of the navigated domain. The check is done as a linear search, as described in the design doc in the previous CL.

Bug: 913647
Change-Id: Ia78a079e786703678ef93c6f341138e15d074a6f
Reviewed-on: https://chromium-review.googlesource.com/c/1378973
Commit-Queue: Mustafa Emre Acer <[email protected]>
Reviewed-by: Tommy Li <[email protected]>
Cr-Commit-Position: refs/heads/master@{#617698}
  • Loading branch information
meacer authored and Commit Bot committed Dec 19, 2018
1 parent d3b21da commit 5b516e0
Show file tree
Hide file tree
Showing 12 changed files with 405 additions and 105 deletions.
2 changes: 2 additions & 0 deletions chrome/browser/ui/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,8 @@ jumbo_split_static_library("ui") {
"//components/update_client",
"//components/upload_list",
"//components/url_formatter",
"//components/url_formatter/top_domains:common",
"//components/url_formatter/top_domains:generate_top_domains_for_edit_distance",
"//components/user_manager",
"//components/user_prefs",
"//components/variations",
Expand Down
252 changes: 176 additions & 76 deletions chrome/browser/ui/omnibox/lookalike_url_navigation_observer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "components/omnibox/browser/autocomplete_match.h"
#include "components/ukm/content/source_url_recorder.h"
#include "components/url_formatter/idn_spoof_checker.h"
#include "components/url_formatter/top_domains/top_domain_util.h"
#include "components/url_formatter/url_formatter.h"
#include "content/public/browser/navigation_handle.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
Expand All @@ -24,6 +25,12 @@

namespace {

#include "components/url_formatter/top_domains/top500-domains-inc.cc"

using MatchType = LookalikeUrlNavigationObserver::MatchType;
using NavigationSuggestionEvent =
LookalikeUrlNavigationObserver::NavigationSuggestionEvent;

void RecordEvent(
LookalikeUrlNavigationObserver::NavigationSuggestionEvent event) {
UMA_HISTOGRAM_ENUMERATION(LookalikeUrlNavigationObserver::kHistogramName,
Expand All @@ -41,6 +48,71 @@ bool SkeletonsMatch(const url_formatter::Skeletons& skeletons1,
return false;
}

// Returns a site that the user has used before that the eTLD+1 in
// |domain_and_registry| may be attempting to spoof, based on skeleton
// comparison.
//
// Returns an empty string when |domain_and_registry| is empty, when the user
// has already engaged (at MEDIUM level or above) with a site whose eTLD+1 is
// exactly |domain_and_registry|, or when no engaged site has a matching
// skeleton. Otherwise returns the host of the first engaged origin whose
// eTLD+1 skeletons match those of |domain_and_registry|.
std::string GetMatchingSiteEngagementDomain(
    SiteEngagementService* service,
    const std::string& domain_and_registry) {
  // Compute skeletons using eTLD+1.
  // eTLD+1 can be empty for private domains.
  if (domain_and_registry.empty())
    return std::string();

  const url_formatter::IDNConversionResult result =
      url_formatter::IDNToUnicodeWithDetails(domain_and_registry);
  DCHECK(result.has_idn_component);
  const url_formatter::Skeletons navigated_skeletons =
      url_formatter::GetSkeletons(result.result);

  // Cache of eTLD+1 -> skeletons so that each engaged eTLD+1 is converted at
  // most once, however many engaged origins share it.
  std::map<std::string, url_formatter::Skeletons>
      domain_and_registry_to_skeleton;
  const std::vector<mojom::SiteEngagementDetails> engagement_details =
      service->GetAllDetails();
  for (const auto& detail : engagement_details) {
    // Ignore sites with an engagement score lower than MEDIUM.
    if (!service->IsEngagementAtLeast(detail.origin,
                                      blink::mojom::EngagementLevel::MEDIUM))
      continue;

    const std::string engaged_domain_and_registry =
        net::registry_controlled_domains::GetDomainAndRegistry(
            detail.origin,
            net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
    // eTLD+1 can be empty for private domains.
    if (engaged_domain_and_registry.empty())
      continue;

    // If the user has engaged with eTLD+1 of this site, don't show any
    // lookalike navigation suggestions.
    if (domain_and_registry == engaged_domain_and_registry)
      return std::string();

    // Multiple domains can map to the same eTLD+1, avoid skeleton generation
    // when possible.
    auto it = domain_and_registry_to_skeleton.find(engaged_domain_and_registry);
    if (it == domain_and_registry_to_skeleton.end()) {
      // Engaged site can be IDN. Decode as unicode and compute the skeleton
      // from that. At this point, top domain checks have already been done, so
      // if the site is IDN, it'll always be decoded as unicode (i.e. IDN spoof
      // checker will not find a matching top domain and fall back to punycode
      // for it).
      const url_formatter::IDNConversionResult conversion_result =
          url_formatter::IDNToUnicodeWithDetails(engaged_domain_and_registry);
      it = domain_and_registry_to_skeleton
               .emplace(engaged_domain_and_registry,
                        url_formatter::GetSkeletons(conversion_result.result))
               .first;
    }

    // Compare against the cached entry in place; no per-iteration copy of the
    // skeleton set is needed.
    if (SkeletonsMatch(navigated_skeletons, it->second))
      return detail.origin.host();
  }
  return std::string();
}

} // namespace

// static
Expand Down Expand Up @@ -76,25 +148,10 @@ void LookalikeUrlNavigationObserver::DidFinishNavigation(
if (service->IsEngagementAtLeast(url, blink::mojom::EngagementLevel::MEDIUM))
return;

const base::StringPiece host = url.host_piece();
url_formatter::IDNConversionResult result =
url_formatter::IDNToUnicodeWithDetails(host);
if (!result.has_idn_component)
return;

std::string matched_domain;
MatchType match_type;
if (result.matching_top_domain.empty()) {
matched_domain = GetMatchingSiteEngagementDomain(service, url);
if (matched_domain.empty())
return;
RecordEvent(NavigationSuggestionEvent::kMatchSiteEngagement);
match_type = MatchType::kSiteEngagement;
} else {
matched_domain = result.matching_top_domain;
RecordEvent(NavigationSuggestionEvent::kMatchTopSite);
match_type = MatchType::kTopSite;
}
if (!GetMatchingDomain(url, service, &matched_domain, &match_type))
return;

DCHECK(!matched_domain.empty());

Expand All @@ -119,80 +176,123 @@ void LookalikeUrlNavigationObserver::DidFinishNavigation(
}
}

// static
// Attaches a LookalikeUrlNavigationObserver to |web_contents| as user data,
// unless FromWebContents() already finds one. |web_contents| must not be null.
// NOTE(review): an identical definition of this function appears further down
// in this listing; confirm which copy is the current one.
void LookalikeUrlNavigationObserver::CreateForWebContents(
content::WebContents* web_contents) {
DCHECK(web_contents);
// Only create and register an observer when none is attached yet.
if (!FromWebContents(web_contents)) {
web_contents->SetUserData(
UserDataKey(),
std::make_unique<LookalikeUrlNavigationObserver>(web_contents));
}
}

std::string LookalikeUrlNavigationObserver::GetMatchingSiteEngagementDomain(
bool LookalikeUrlNavigationObserver::GetMatchingDomain(
const GURL& url,
SiteEngagementService* service,
const GURL& url) {
// Compute skeletons using eTLD+1.
std::string* matched_domain,
MatchType* match_type) {
// Perform all computations on eTLD+1.
const std::string domain_and_registry =
net::registry_controlled_domains::GetDomainAndRegistry(
url, net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
// eTLD+1 can be empty for private domains.
if (domain_and_registry.empty())
return std::string();

url_formatter::IDNConversionResult result =
url_formatter::IDNToUnicodeWithDetails(domain_and_registry);
DCHECK(result.has_idn_component);
const url_formatter::Skeletons navigated_skeletons =
url_formatter::GetSkeletons(result.result);

std::map<std::string, url_formatter::Skeletons>
domain_and_registry_to_skeleton;
std::vector<mojom::SiteEngagementDetails> engagement_details =
service->GetAllDetails();
for (const auto& detail : engagement_details) {
// Ignore sites with an engagement score lower than LOW.
if (!service->IsEngagementAtLeast(detail.origin,
blink::mojom::EngagementLevel::MEDIUM))
continue;

// If the user has engaged with eTLD+1 of this site, don't show any
// lookalike navigation suggestions.
const std::string engaged_domain_and_registry =
net::registry_controlled_domains::GetDomainAndRegistry(
detail.origin,
net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
// eTLD+1 can be empty for private domains.
if (engaged_domain_and_registry.empty())
continue;
if (result.has_idn_component) {
// If the navigated domain is IDN, check its skeleton against top domains
// and engaged sites.
if (!result.matching_top_domain.empty()) {
RecordEvent(NavigationSuggestionEvent::kMatchTopSite);
*matched_domain = result.matching_top_domain;
*match_type = MatchType::kTopSite;
return true;
}

if (domain_and_registry == engaged_domain_and_registry)
return std::string();
const std::string matched_engaged_domain =
GetMatchingSiteEngagementDomain(service, domain_and_registry);
if (!matched_engaged_domain.empty()) {
RecordEvent(NavigationSuggestionEvent::kMatchSiteEngagement);
*matched_domain = matched_engaged_domain;
*match_type = MatchType::kSiteEngagement;
return true;
}
}

// Multiple domains can map to the same eTLD+1, avoid skeleton generation
// when possible.
auto it = domain_and_registry_to_skeleton.find(engaged_domain_and_registry);
url_formatter::Skeletons skeletons;
if (it == domain_and_registry_to_skeleton.end()) {
// Engaged site can be IDN. Decode as unicode and compute the skeleton
// from that. At this point, top domain checks have already been done, so
// if the site is IDN, it'll always be decoded as unicode (i.e. IDN spoof
// checker will not find a matching top domain and fall back to punycode
// for it).
url_formatter::IDNConversionResult conversion_result =
url_formatter::IDNToUnicodeWithDetails(engaged_domain_and_registry);
// If we can't find an exact top domain or an engaged site, try to find a top
// domain within an edit distance of one.
const std::string similar_domain =
GetSimilarDomainFromTop500(base::UTF16ToUTF8(result.result));
if (!similar_domain.empty() && domain_and_registry != similar_domain) {
RecordEvent(NavigationSuggestionEvent::kMatchEditDistance);
*matched_domain = similar_domain;
*match_type = MatchType::kEditDistance;
return true;
}
return false;
}

skeletons = url_formatter::GetSkeletons(conversion_result.result);
domain_and_registry_to_skeleton[engaged_domain_and_registry] = skeletons;
// static
// Returns true if |str1| can be turned into |str2| with at most one
// single-character insertion, deletion or substitution.
//
// Both strings are walked once in lockstep. On the first mismatch the longer
// string's iterator is advanced alone (treating the character as an
// insertion/deletion), or both are advanced when the lengths are equal
// (treating it as a substitution). A second mismatch means the distance
// exceeds one.
bool LookalikeUrlNavigationObserver::IsEditDistanceAtMostOne(
    const base::string16& str1,
    const base::string16& str2) {
  // Lengths differing by more than one always need at least two edits.
  if (str1.size() > str2.size() + 1 || str2.size() > str1.size() + 1) {
    return false;
  }
  base::string16::const_iterator i = str1.begin();
  base::string16::const_iterator j = str2.begin();
  size_t edit_count = 0;
  while (i != str1.end() && j != str2.end()) {
    if (*i == *j) {
      ++i;
      ++j;
    } else {
      ++edit_count;
      if (edit_count > 1) {
        return false;
      }

      if (str1.size() > str2.size()) {
        // First string is longer than the second. This can only happen if the
        // first string has an extra character.
        ++i;
      } else if (str2.size() > str1.size()) {
        // Second string is longer than the first. This can only happen if the
        // second string has an extra character.
        ++j;
      } else {
        // Both strings are the same length. This can only happen if the two
        // strings differ by a single character.
        ++i;
        ++j;
      }
    }
  }
  if (i != str1.end() || j != str2.end()) {
    // A character at the end did not match.
    ++edit_count;
  }
  return edit_count <= 1;
}
// static
std::string LookalikeUrlNavigationObserver::GetSimilarDomainFromTop500(
const std::string& domain_and_registry) {
if (!url_formatter::top_domains::IsEditDistanceCandidate(
domain_and_registry)) {
return std::string();
}

for (const std::string& skeleton :
url_formatter::GetSkeletons(base::UTF8ToUTF16(domain_and_registry))) {
for (const char* const top_domain_skeleton : kTop500) {
if (IsEditDistanceAtMostOne(base::UTF8ToUTF16(skeleton),
base::UTF8ToUTF16(top_domain_skeleton))) {
return url_formatter::LookupSkeletonInTopDomains(top_domain_skeleton);
}
}
}
return std::string();
}

// static
// Ensures |web_contents| has a LookalikeUrlNavigationObserver registered as
// user data. Does nothing when one is already attached.
void LookalikeUrlNavigationObserver::CreateForWebContents(
    content::WebContents* web_contents) {
  DCHECK(web_contents);

  // An observer already exists for these contents; keep it.
  if (FromWebContents(web_contents))
    return;

  web_contents->SetUserData(
      UserDataKey(),
      std::make_unique<LookalikeUrlNavigationObserver>(web_contents));
}

WEB_CONTENTS_USER_DATA_KEY_IMPL(LookalikeUrlNavigationObserver)
34 changes: 28 additions & 6 deletions chrome/browser/ui/omnibox/lookalike_url_navigation_observer.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,23 @@ class LookalikeUrlNavigationObserver
kLinkClicked = 2,
kMatchTopSite = 3,
kMatchSiteEngagement = 4,
kMatchEditDistance = 5,

// Append new items to the end of the list above; do not modify or
// replace existing values. Comment out obsolete items.
kMaxValue = kMatchSiteEngagement,
kMaxValue = kMatchEditDistance,
};

// Used for UKM. There is only a single MatchType per navigation.
enum class MatchType {
  kNone = 0,
  kTopSite = 1,
  kSiteEngagement = 2,
  kEditDistance = 3,

  // Append new items to the end of the list above; do not modify or replace
  // existing values. Comment out obsolete items.
  // kMaxValue must always alias the last real enumerator.
  kMaxValue = kEditDistance,
};

static const char kHistogramName[];
Expand All @@ -57,10 +59,30 @@ class LookalikeUrlNavigationObserver

private:
friend class content::WebContentsUserData<LookalikeUrlNavigationObserver>;
// Returns a site that the user has used before that |url| may be attempting
// to spoof, based on skeleton comparison.
std::string GetMatchingSiteEngagementDomain(SiteEngagementService* service,
const GURL& url);
FRIEND_TEST_ALL_PREFIXES(LookalikeUrlNavigationObserverTest,
IsEditDistanceAtMostOne);

// Returns true if a domain is visually similar to the hostname of |url|. The
// matching domain can be a top domain or an engaged site. Similarity check
// is made using both visual skeleton and edit distance comparison. If this
// returns true, match details will be written into |matched_domain| and
// |match_type|. They cannot be nullptr.
bool GetMatchingDomain(const GURL& url,
SiteEngagementService* service,
std::string* matched_domain,
MatchType* match_type);

// Returns true if the Levenshtein distance between |str1| and |str2| is at most 1.
// This has O(max(n,m)) complexity as opposed to O(n*m) of the usual edit
// distance computation.
static bool IsEditDistanceAtMostOne(const base::string16& str1,
const base::string16& str2);

// Returns the first matching top domain with an edit distance of at most one
// to |domain_and_registry|.
static std::string GetSimilarDomainFromTop500(
const std::string& domain_and_registry);

WEB_CONTENTS_USER_DATA_KEY_DECL();
};

Expand Down
Loading

0 comments on commit 5b516e0

Please sign in to comment.