The following program first creates a simulated intranet by generating a set of HTML pages under a directory from Java code.
For the simulated intranet, the following steps are taken:
- run a simple HTTP server on local port 8080 (the sanity-check sketch after this list can confirm it is reachable)
python -m SimpleHTTPServer 8080
(on Python 3: python3 -m http.server 8080)
- add the following entries to /etc/hosts
127.0.0.1 localhost
127.0.0.1 localhost0
127.0.0.1 localhost1
127.0.0.1 localhost2
127.0.0.1 localhost3
127.0.0.1 localhost4
127.0.0.1 localhost5
127.0.0.1 localhost6
127.0.0.1 localhost7
127.0.0.1 localhost8
- for the controlled LAN, call controledLAN() in the Java code (the default below)
controledLAN();
- for the random LAN, comment out controledLAN() and un-comment randomLAN() in the Java code
//randomLAN();
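As a quick sanity check that the server and the /etc/hosts aliases are in place, a page can be fetched directly before running the crawler. This is a minimal sketch, not part of the program: HostCheck is a hypothetical helper, and it assumes the start page 1-v0.html has already been generated under webdir (the program writes it on its first run).
import java.io.*;
import java.net.*;
public class HostCheck {
public static void main(String... args) throws IOException {
URL url = new URL("http://localhost1:8080/webdir/1-v0.html");
try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()))) {
br.lines().forEach(System.out::println); // should print the generated page with its <a href> links
}
}
}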
The program's crawler starts from a start URL and crawls the whole LAN (localhost0 to localhost8) in parallel. If hostA has any outbound link to hostB, the digraph gets a directed edge from hostA to hostB. Once the crawler has finished following all reachable links, the digraph of connected hosts is complete, and the program computes its strongly connected components with Kosaraju's algorithm.
The digraph, the reverse digraph, and the reverse digraph's depth-first preorder, postorder, and reverse postorder are printed so the algorithm's call sequence can be inspected.
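For the controlled LAN defined in controledLAN() below, the page links collapse to this host-level digraph (each cross-host link becomes one directed edge):
localhost1 -> {localhost2}
localhost2 -> {localhost1, localhost3}
localhost3 -> {}
so the run should find two strongly connected components, {localhost1, localhost2} and {localhost3}; localhost1 and localhost2 share a component id in the printed map.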
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.io.*;
import java.net.*;
import java.nio.file.*;
public class ParallelCrawler {
// python -m SimpleHTTPServer 8080
// local training directory http://localhost:8080/webdir/
private static final String WEBDIR = "/Users/homenetwork/webdir";
private static final String STARTURL = "http://localhost1:8080/webdir/1-v0.html";
private static final Set<String> visited = new ConcurrentSkipListSet<>(); // pages already fetched (thread-safe)
private static final ConcurrentMap<String, Set<String>> digraph = new ConcurrentHashMap<>(); // host -> hosts it links to, built while crawling
private AtomicInteger limit = new AtomicInteger(); // number of crawUrls invocations so far
private static final int SEARCHLIMIT = 1000; // cap on crawUrls invocations
private static final Map<String, Set<String>> digraphCtr = new HashMap<>(); // page -> linked pages, used to generate the simulated LAN
private static StringBuilder sb = new StringBuilder(); // scratch buffer for page generation
private static int LANSIZE = 200; // the simulated LAN is bounded; the real internet is effectively infinite
private static Random r = new Random();
private static void randomLAN() {
for (int i = 0; i < 9; i++) { // one site per localhost0..localhost8 entry in /etc/hosts
int numV = r.nextInt(LANSIZE / 10); // number of pages on each site (may be 0)
for (int j = 0; j < numV; j++) {
Set<String> v = new HashSet<>();
for (int k = 0; k < r.nextInt(5); k++) { // internal links per page
v.add(i + "-v" + r.nextInt(numV));
}
if (r.nextBoolean()) {
v.add(r.nextInt(9) + "-v" + r.nextInt(numV)); // link to another (random) site
}
digraphCtr.put(i + "-v" + j, v);
}
}
}
private static void controledLAN() {
digraphCtr.put("1-v0", new HashSet<>(Arrays.asList("1-v1", "2-v0")));
digraphCtr.put("1-v1", new HashSet<>(Arrays.asList("1-v0", "1-v5", "1-v4")));
digraphCtr.put("2-v0", new HashSet<>(Arrays.asList("1-v1", "2-v1", "2-v3")));
digraphCtr.put("2-v1", new HashSet<>(Arrays.asList("1-v1", "2-v0", "3-v0")));
digraphCtr.put("3-v0", new HashSet<>(Arrays.asList("3-v1")));
}
/*
 * Page naming: in "1-v0" the leading digit picks the host (localhost1, aliased to
 * 127.0.0.1 in /etc/hosts) and the rest names the page, so page 1-v0 is served as
 * http://localhost1:8080/webdir/1-v0.html.
 */
private static void internetSim() throws IOException {
controledLAN();
//randomLAN();
Set<Path> htmls = Files.walk(Paths.get(WEBDIR))
.filter(i -> i.toString().endsWith("html"))
.collect(Collectors.toSet());
for (Path p : htmls) {
Files.deleteIfExists(p);
}
for (String key : digraphCtr.keySet()) {
sb.setLength(0);
sb.append("<html>").append("page " + key).append("</br>");
digraphCtr.get(key).forEach(
v -> sb.append("<a href=\"http://localhost" + v.charAt(0)
+ ":8080/webdir/" + v + ".html\">" + "page " + v
+ " link</a></br>"));
sb.append("</html>");
Files.write(Paths.get(WEBDIR + "/" + key + ".html"), sb.toString()
.getBytes());
}
System.out.println("generated " + digraphCtr.size()
+ " html files under directory " + WEBDIR);
try {
System.out.println("give 2 seconds for stablize");
Thread.sleep(2000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
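// For the controlled LAN, a generated page such as 1-v0.html looks like the following
// (written as a single line; link order may vary because the links come from a Set):
// <html>page 1-v0<br/><a href="http://localhost1:8080/webdir/1-v1.html">page 1-v1 link</a><br/>
// <a href="http://localhost2:8080/webdir/2-v0.html">page 2-v0 link</a><br/></html>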
public static void main(String... args) {
try {
internetSim();
Set<URL> urls = new HashSet<>();
urls.add(new URL(STARTURL));
ParallelCrawler pc = new ParallelCrawler();
pc.crawUrls(urls);
} catch (IOException e) {
e.printStackTrace();
}
Set<String> totalVertexNodes = new HashSet<>(digraphCtr.keySet()); // every generated page
digraphCtr.values().forEach(totalVertexNodes::addAll); // plus every page that is linked to (whether or not it exists)
System.out.println("visited " + visited.size() + " nodes out of total "
+ totalVertexNodes.size());
System.out.println("digraph: " + digraph);
KosarajuSCC scc = new KosarajuSCC(digraph);
System.out.println("strongly connected components: " + scc.getIds());
}
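// crawUrls fans out over the current frontier: each URL is fetched in parallel and its
// outgoing links become the next frontier. The limit counter caps the number of
// recursive crawUrls calls at SEARCHLIMIT so the crawl cannot expand without bound.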
private void crawUrls(Set<URL> urls) {
if (limit.incrementAndGet() >= SEARCHLIMIT)
return;
urls.parallelStream().forEach(url -> {
Set<URL> adj = getURLs(url);
if (adj != null)
crawUrls(adj);
});
}
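// getURLs fetches one page, extracts its links, and records a host-level edge from the
// page's host to every different host it links to. A page that has already been visited
// returns null so the caller stops expanding that branch.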
private Set<URL> getURLs(URL url) {
if (!visited.add(url.toString())) // add() is atomic; false means another thread already claimed this page
return null;
Set<URL> urls = new HashSet<>();
try (BufferedReader br = new BufferedReader(new InputStreamReader(
url.openStream()));) {
Set<String> tmp = br.lines().filter(i -> i.contains("href"))
.map(i -> HTMLLinkExtractor2.grabHTMLLinks(i))
.flatMap(i -> i.stream()).collect(Collectors.toSet());
for (String i : tmp) {
System.out.println(i);
urls.add(new URL(i));
}
digraph.merge(
url.getHost(),
urls.stream().map(URL::getHost)
.filter(i -> !i.equals(url.getHost()))
.collect(Collectors.toSet()), (ov, nv) -> {
ov.addAll(nv);
return ov;
});
} catch (FileNotFoundException e) {
// do nothing
} catch (IOException e) {
e.printStackTrace();
}
return urls;
}
}
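// KosarajuSCC below can also be exercised on its own, without the crawler. A minimal
// sketch (SCCDemo is not part of the original program; the tiny digraph is made up):
class SCCDemo {
public static void main(String... args) {
Map<String, Set<String>> g = new HashMap<>();
g.put("a", new HashSet<>(Arrays.asList("b"))); // a -> b
g.put("b", new HashSet<>(Arrays.asList("a", "c"))); // b -> a, b -> c
g.put("c", new HashSet<>()); // c has no outgoing edges
KosarajuSCC scc = new KosarajuSCC(g);
System.out.println(scc.getIds()); // a and b should share a component id, c gets its own
}
}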
class KosarajuSCC {
private Set<String> marked = new HashSet<>(); // reached vertices
private Map<String, Integer> ids = new HashMap<>(); // component identifiers
private int count; // number of strong components
public KosarajuSCC(Map<String, Set<String>> G) {
Map<String, Set<String>> reversed = reverse(G); // build the reverse digraph once
System.out.println("reverse digraph: " + reversed);
DepthFirstOrder order = new DepthFirstOrder(reversed);
String s = null;
System.out.println("\npre order");
while((s = order.pre().poll()) != null) {
System.out.print(s + " -> ");
}
System.out.println("\npost order");
while((s = order.post().poll()) != null) {
System.out.print(s + " -> ");
}
System.out.println("\nreversePost order");
Stack<String> stack = order.reversePost();
while(!stack.isEmpty()) {
s = stack.pop();
System.out.print(s + " -> ");
if (!marked.contains(s)) {
dfs(G, s);
count++;
}
}
System.out.println();
}
private Map<String, Set<String>> reverse(Map<String, Set<String>> g) {
Map<String, Set<String>> reverse = new HashMap<>();
for (String k : g.keySet()) {
reverse.putIfAbsent(k, new HashSet<>()); // keep vertices that have no incoming edges
for (String v : g.get(k)) {
reverse.putIfAbsent(v, new HashSet<>());
reverse.get(v).add(k);
}
}
return reverse;
}
private void dfs(Map<String, Set<String>> G, String v) {
marked.add(v);
ids.put(v, count);
if(G.get(v) == null) return;
for (String w : G.get(v))
if (!marked.contains(w))
dfs(G, w);
}
public boolean stronglyConnected(String v, String w) { // vertices are host/page names, not ints
return Objects.equals(ids.get(v), ids.get(w)); // compare component ids by value
}
public Map<String, Integer> getIds() {
return ids;
}
public int count() {
return count;
}
}
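// DepthFirstOrder records three vertex orderings during a depth-first search: preorder
// (queued on entry to dfs), postorder (queued on exit), and reverse postorder (pushed on
// exit). Kosaraju's algorithm consumes the reverse postorder of the reverse digraph.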
class DepthFirstOrder {
private Set<String> marked = new HashSet<>();
private Queue<String> pre;
private Queue<String> post;
private Stack<String> reversePost; // vertices in reverse postorder
public DepthFirstOrder(Map<String, Set<String>> revg) {
pre = new ArrayDeque<String>();
post = new ArrayDeque<String>();
reversePost = new Stack<String>();
for (String key : revg.keySet()) {
if (!marked.contains(key))
dfs(revg, key);
}
}
private void dfs(Map<String, Set<String>> tmpg2, String key) {
pre.offer(key);
marked.add(key);
if(tmpg2.get(key) != null) {
for (String w : tmpg2.get(key))
if (!marked.contains(w))
dfs(tmpg2, w);
}
post.offer(key);
reversePost.push(key);
}
public Queue<String> pre() {
return pre;
}
public Queue<String> post() {
return post;
}
public Stack<String> reversePost() {
return reversePost;
}
}
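// HTMLLinkExtractor2 pulls href targets out of <a> tags with two regexes: the first
// matches a whole anchor tag, the second extracts the href value, and the surrounding
// quotes are stripped afterwards. For example, grabHTMLLinks on
// <a href="http://localhost2:8080/webdir/2-v0.html">page 2-v0 link</a>
// returns a set containing http://localhost2:8080/webdir/2-v0.html.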
class HTMLLinkExtractor2 {
private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";
private static final String HTML_A_HREF_TAG_PATTERN = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
public static Set<String> grabHTMLLinks(final String html) {
Matcher matcherLink;
Pattern patternTag = Pattern.compile(HTML_A_TAG_PATTERN);
Pattern patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN);
Matcher matcherTag = patternTag.matcher(html);
Set<String> result = new HashSet<String>();
while (matcherTag.find()) {
String href = matcherTag.group(1); // attribute text of the <a> tag, which carries the href
matcherLink = patternLink.matcher(href);
while (matcherLink.find()) {
String link = matcherLink.group(1); // link
link = link.replaceAll("'", "");
link = link.replaceAll("\"", "");
result.add(link);
}
}
return result;
}
}
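Assuming all the classes above are kept in a single file named ParallelCrawler.java and the simulated intranet setup described at the top is in place, the usual commands compile and run the program:
javac ParallelCrawler.java
java ParallelCrawler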
This parallel crawler only works on a LAN with a limited number of hosts; on the internet, where the number of hosts is effectively unbounded, the crawl would never finish (hence the SEARCHLIMIT cap).