forked from HarryDulaney/intro-to-java-programming
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExercise12_33.java
80 lines (73 loc) · 3.02 KB
/
Exercise12_33.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
package ch_12;
import java.io.BufferedInputStream;
import java.util.ArrayList;
import java.util.Scanner;
/**
* **12.33 (Search Web) Modify Listing 12.18 WebCrawler.java to search for the word
* Computer Programming starting from the URL http://cs.armstrong.edu/liang.
* Your program terminates once the word is found. Display the URL for the page
* that contains the word.
* <p>
* Because http://cs.armstrong.edu/liang is no longer active I substituted that with:
* "https://en.wikipedia.org/wiki/Computer"
*/
public class Exercise12_33 {
    /** Phrase to look for on each crawled page; matching is case-sensitive. */
    private static final String SEARCH_WORD = "Computer Programming";

    /** Scheme prefix that marks the start of a link worth following. */
    private static final String URL_PREFIX = "https:";

    /** The crawl loop keeps going while no more than this many pages have been visited. */
    private static final int MAX_TRAVERSED = 100;

    /**
     * Starts the crawl from a fixed Wikipedia page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String startURL = "https://en.wikipedia.org/wiki/Computer";
        crawl(startURL);
    }

    /**
     * Crawls pages breadth-first starting at {@code startingURL}. Each page is
     * scanned by {@link #getSubURLs(String)}, which terminates the JVM as soon as
     * the search phrase is found; otherwise the links it returns are queued.
     * The loop ends when no pending URLs remain or the traversed-page limit is exceeded.
     *
     * @param startingURL the first URL to visit
     */
    public static void crawl(String startingURL) {
        ArrayList<String> listOfPendingURLs = new ArrayList<>();
        ArrayList<String> listOfTraversedURLs = new ArrayList<>();
        listOfPendingURLs.add(startingURL);

        while (!listOfPendingURLs.isEmpty()
                && listOfTraversedURLs.size() <= MAX_TRAVERSED) {
            String urlString = listOfPendingURLs.remove(0);
            if (listOfTraversedURLs.contains(urlString)) {
                continue; // already visited via another link
            }
            listOfTraversedURLs.add(urlString);
            System.out.println("Crawling: " + urlString);

            for (String s : getSubURLs(urlString)) {
                if (!listOfTraversedURLs.contains(s)) {
                    listOfPendingURLs.add(s);
                }
            }
        }
    }

    /**
     * Reads the page at {@code urlString} line by line. If a line contains the
     * search phrase, the page URL, line number and line are printed and the
     * program exits with status 0. Otherwise every {@code https:} link found
     * on the page (text from the prefix up to the next double quote) is returned.
     *
     * <p>The search and the link extraction happen in the same pass over the
     * input: a second pass over the same {@link Scanner} would see no lines.
     *
     * @param urlString address of the page to read
     * @return the links found on the page, in order of appearance; empty if the
     *         page could not be read (the error message is printed)
     */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<>();
        // try-with-resources closes the scanner and the underlying URL stream.
        try (Scanner input = new Scanner(new java.net.URL(urlString).openStream())) {
            int lineNumber = 0;
            while (input.hasNextLine()) {
                lineNumber++;
                String line = input.nextLine();
                if (line.contains(SEARCH_WORD)) {
                    System.out.println("Search phrase found on page: " + urlString
                            + " @ lineNumber: " + lineNumber);
                    System.out.println(line);
                    System.exit(0);
                }
                collectUrls(line, list);
            }
        } catch (Exception ex) {
            // Best effort: an unreadable or malformed URL must not stop the crawl.
            System.out.println("Error: " + ex.getMessage());
        }
        return list;
    }

    /** Appends every quote-terminated {@code https:} link in {@code line} to {@code list}. */
    private static void collectUrls(String line, ArrayList<String> list) {
        int current = line.indexOf(URL_PREFIX);
        // >= 0 so that a link starting at the very first column is not skipped.
        while (current >= 0) {
            int endIndex = line.indexOf('"', current);
            if (endIndex < 0) {
                break; // no closing quote: not a well-formed link, stop scanning this line
            }
            list.add(line.substring(current, endIndex));
            current = line.indexOf(URL_PREFIX, endIndex);
        }
    }
}