Skip to content

Commit

Permalink
Merge pull request yasserg#117 from rvenutolo/robotstxtuseragentlowercase
Browse files Browse the repository at this point in the history

When checking a robots.txt directive's user agent against the crawler's user agent, compare both in lowercase.
  • Loading branch information
yasserg committed Apr 4, 2016
2 parents 996f6d0 + 9076c81 commit 4a89f0a
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public static HostDirectives parse(String content, String myUserAgent) {

if (line.matches(PATTERNS_USERAGENT)) {
String ua = line.substring(PATTERNS_USERAGENT_LENGTH).trim().toLowerCase();
inMatchingUserAgent = "*".equals(ua) || ua.contains(myUserAgent);
inMatchingUserAgent = "*".equals(ua) || ua.contains(myUserAgent.toLowerCase());
} else if (line.matches(PATTERNS_DISALLOW)) {
if (!inMatchingUserAgent) {
continue;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package edu.uci.ics.crawler4j.tests;

import edu.uci.ics.crawler4j.robotstxt.HostDirectives;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtParser;
import org.junit.Test;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;

/**
 * Checks that {@link RobotstxtParser#parse} honours a robots.txt rule whose
 * {@code User-agent} value contains upper-case characters when the crawler's
 * own user agent is supplied with the same mixed casing.
 */
public class RobotstxtParserNonLowercaseUserAgentTest {

    /**
     * Parses a two-line robots.txt that disallows {@code /test/path/} for the
     * mixed-case agent "testAgent", passing that same agent string as the
     * crawler's user agent, and expects the resulting directives to be
     * non-null and to reject that path.
     */
    @Test
    public void testParseWithNonLowercaseUserAgent() {
        final String mixedCaseAgent = "testAgent";

        // Assemble the robots.txt body: one User-agent line, one Disallow line.
        final StringBuilder robotsTxt = new StringBuilder();
        robotsTxt.append("User-agent: ").append(mixedCaseAgent).append("\n");
        robotsTxt.append("Disallow: /test/path/\n");

        // The same mixed-case string is used for the directive and the crawler.
        final HostDirectives parsed = RobotstxtParser.parse(robotsTxt.toString(), mixedCaseAgent);
        assertNotNull("parsed HostDirectives is null", parsed);

        final boolean pathAllowed = parsed.allows("/test/path/");
        assertFalse("HostDirectives should not allow path: '/test/path/'", pathAllowed);
    }

}

0 comments on commit 4a89f0a

Please sign in to comment.