Skip to content

Commit

Permalink
an optimization suggested by Gordon: organize cookies by host (domain…
Browse files Browse the repository at this point in the history
…), rather than top private domain, and supply a composite list of cookies from the host and each parent domain to the http client library
  • Loading branch information
nlevitt committed Sep 30, 2014
1 parent 5fd120d commit 8d98882
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
Expand All @@ -46,8 +47,6 @@
import org.archive.spring.ConfigPath;
import org.springframework.context.Lifecycle;

import com.google.common.net.InternetDomainName;

abstract public class AbstractCookieStore implements Lifecycle, Checkpointable,
CookieStore, FetchHTTPCookieStore {

Expand Down Expand Up @@ -131,7 +130,7 @@ public void saveCookies(String saveCookiesFile) {
String tab ="\t";
out.write("# Heritrix Cookie File\n".getBytes());
out.write("# This file is the Netscape cookies.txt format\n\n".getBytes());
for (Cookie cookie: getCookies()) {
for (Cookie cookie: new ArrayList<Cookie>(getCookies())) {
// Guess an initial size
MutableString line = new MutableString(1024 * 2);
line.append(cookie.getDomain());
Expand Down Expand Up @@ -246,62 +245,38 @@ public void addCookie(Cookie cookie) {
}

/**
* Returns a string that uniquely identifies the cookie, and is prepended
* with the top private domain (one level below the TLD) associated with the
* cookie. This way such cookies can be grouped together in a sorted list,
* for example. The format of the key is
* {@code "topPrivateDomain;normalizedDomain;name;path"}. Adapted from
* Returns a string that uniquely identifies the cookie. The format of the
* key is {@code "normalizedDomain;name;path"}. Adapted from
* {@link CookieIdentityComparator#compare(Cookie, Cookie)}.
*/
protected String sortableKey(Cookie cookie) {
String normalizedDomain = normalizeDomain(cookie.getDomain());
String topPrivateDomain = topPrivateDomain(normalizedDomain);
String normalizedDomain = normalizeHost(cookie.getDomain());

// use ";" as delimiter since it is the delimiter in the cookie header,
// so presumably can't appear in any of these values
StringBuilder buf = new StringBuilder(topPrivateDomain);
buf.append(";").append(normalizedDomain);
StringBuilder buf = new StringBuilder(normalizedDomain);
buf.append(";").append(cookie.getName());
buf.append(";").append(cookie.getPath() != null ? cookie.getPath() : "/");

return buf.toString();
}

/**
* Returns the top private domain, i.e. the topmost assigned domain, one
* level below the TLD, for the supplied {@code host}. Returns
* {@code host} unaltered if a top private domain can't be identified (for
* example, if {@code host} is an IP address).
*/
protected String topPrivateDomain(String host) {
if (InternetDomainName.isValid(host)) {
InternetDomainName d = InternetDomainName.from(host);
if (d.hasPublicSuffix()) {
return d.topPrivateDomain().toString();
}
}

return host;
}

protected String normalizeDomain(String domain) {
if (domain == null) {
domain = "";
protected String normalizeHost(String host) {
if (host == null) {
host = "";
}
if (domain.startsWith(".")) {
domain = domain.substring(1);
if (host.startsWith(".")) {
host = host.substring(1);
}
domain = domain.toLowerCase(Locale.ENGLISH);
return domain;
host = host.toLowerCase(Locale.ENGLISH);
return host;
}

public CookieStore cookieStoreFor(CrawlURI curi) throws URIException {
String normalizedDomain = normalizeDomain(curi.getUURI().getHost());
String topPrivateDomain = topPrivateDomain(normalizedDomain);
return cookieStoreFor(topPrivateDomain);
String normalizedHost = normalizeHost(curi.getUURI().getHost());
return cookieStoreFor(normalizedHost);
}

abstract public CookieStore cookieStoreFor(String topPrivateDomain);
abstract public void addCookie(Cookie cookie);
abstract public void clear();
abstract protected void prepare();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@
import java.util.ListIterator;
import java.util.SortedMap;

import org.apache.commons.collections.collection.CompositeCollection;
import org.apache.http.client.CookieStore;
import org.apache.http.cookie.Cookie;
import org.archive.bdb.BdbModule;
import org.archive.checkpointing.Checkpoint;
import org.springframework.beans.factory.annotation.Autowired;

import com.google.common.net.InternetDomainName;
import com.sleepycat.bind.ByteArrayBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
Expand All @@ -43,10 +45,10 @@

/**
* Cookie store using bdb for storage. Cookies are stored in a SortedMap keyed
* by {@link #sortableKey(Cookie)}, so they are grouped together by top private
* domain. {@link #cookieStoreFor(String)} returns a facade whose
* by {@link #sortableKey(Cookie)}, so they are grouped together by domain.
* {@link #cookieStoreFor(String)} returns a facade whose
* {@link CookieStore#getCookies()} returns a list of cookies limited to
* subdomains of the supplied top private domain.
* the supplied host and parent domains, if applicable.
*
* @see <a href="https://webarchive.jira.com/browse/HER-2070">HER-2070</a>
* @see <a href="https://github.com/internetarchive/heritrix3/pull/96">heritrix3 pull request #96</a>
Expand All @@ -55,7 +57,7 @@
* @contributor nlevitt
*/
public class BdbCookieStore extends AbstractCookieStore implements
FetchHTTPCookieStore, CookieStore {
FetchHTTPCookieStore, CookieStore {

/**
* A {@link List} implementation that wraps a {@link Collection}. Needed
Expand Down Expand Up @@ -142,26 +144,51 @@ public void addCookie(Cookie cookie) {
}
}

/**
* Returns a {@link LimitedCookieStoreFacade} whose
* {@link LimitedCookieStoreFacade#getCookies()} method returns only the
* cookies from the domain {@code topPrivateDomainOrIP} and subdomains.
*/
public CookieStore cookieStoreFor(String topPrivateDomainOrIP) {
SortedMap<byte[], Cookie> domainCookiesSubMap;
protected Collection<Cookie> hostSubset(String host) {
try {
byte[] startKey = topPrivateDomainOrIP.getBytes("UTF-8");
byte[] startKey = (host + ";").getBytes("UTF-8");

char chAfterDelim = (char)(((int)';')+1);
byte[] endKey = (topPrivateDomainOrIP + chAfterDelim).getBytes("UTF-8");
domainCookiesSubMap = cookies.subMap(startKey, endKey);
byte[] endKey = (host + chAfterDelim).getBytes("UTF-8");

SortedMap<byte[], Cookie> submap = cookies.subMap(startKey, endKey);
return submap.values();

} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e); // impossible
}
}

Collection<Cookie> domainCookiesCollection = domainCookiesSubMap.values();
List<Cookie> domainCookiesList = new RestrictedCollectionWrappedList<Cookie>(domainCookiesCollection);
/**
* Returns a {@link LimitedCookieStoreFacade} whose
* {@link LimitedCookieStoreFacade#getCookies()} method returns only cookies
* from {@code host} and its parent domains, if applicable.
*/
public CookieStore cookieStoreFor(String host) {
CompositeCollection cookieCollection = new CompositeCollection();

if (InternetDomainName.isValid(host)) {
InternetDomainName domain = InternetDomainName.from(host);

while (domain != null) {
Collection<Cookie> subset = hostSubset(domain.toString());
cookieCollection.addComposited(subset);

if (domain.hasParent()) {
domain = domain.parent();
} else {
domain = null;
}
}
} else {
Collection<Cookie> subset = hostSubset(host.toString());
cookieCollection.addComposited(subset);
}

return new LimitedCookieStoreFacade(domainCookiesList);
@SuppressWarnings("unchecked")
List<Cookie> cookieList = new RestrictedCollectionWrappedList<Cookie>(cookieCollection);
LimitedCookieStoreFacade store = new LimitedCookieStoreFacade(cookieList);
return store;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
public interface FetchHTTPCookieStore extends CookieStore {
/**
* Returns a {@link CookieStore} whose {@link CookieStore#getCookies()}
* returns all the cookies from {@code topPrivateDomain} and its subdomains.
* returns all the cookies from {@code host} and each of its
* parent domains, if applicable.
*/
public CookieStore cookieStoreFor(String topPrivateDomain);
public CookieStore cookieStoreFor(String host);

/**
* Returns a {@link CookieStore} whose {@link CookieStore#getCookies()}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public boolean clearExpired(Date date) {
}

@Override
public CookieStore cookieStoreFor(String topPrivateDomain) {
public CookieStore cookieStoreFor(String host) {
return this;
}

Expand Down

0 comments on commit 8d98882

Please sign in to comment.