Site Search:

Strongly Connected Components in a simulated intranet

The following program first creates a simulated intranet by generating a number of HTML pages under a directory with Java code.

For the simulated intranet, the following steps are taken:


  • run a simple HTTP server on local port 8080 (on Python 3 the equivalent is python3 -m http.server 8080); a quick sanity check follows this list 
python -m SimpleHTTPServer 8080
  • add the following entries to /etc/hosts
127.0.0.1 localhost
127.0.0.1 localhost0
127.0.0.1 localhost1
127.0.0.1 localhost2
127.0.0.1 localhost3
127.0.0.1 localhost4
127.0.0.1 localhost5
127.0.0.1 localhost6
127.0.0.1 localhost7
127.0.0.1 localhost8
  • for a controlled LAN, call in the Java code
controledLAN();
  • for a random LAN, comment out controledLAN() and un-comment randomLAN() in the Java code
//randomLAN();
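
After the program has generated the pages (and the HTTP server from the first step is running), a quick sanity check, assuming the default directory and start URL used below, is to fetch the start page:
curl http://localhost1:8080/webdir/1-v0.html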

The program's crawler starts from a start URL and crawls the whole LAN (localhost0 to localhost8) in parallel. If hostA has any outbound link to hostB, then hostA gets a directed edge to hostB. Once the crawler finishes crawling all reachable links, a digraph of connected hosts has been generated. The program then calculates the strongly connected components with Kosaraju's algorithm.
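
As a rough sanity check, derived by hand from the controledLAN() mapping in the code below: the crawl should discover localhost1, localhost2 and localhost3, the host digraph should come out as localhost1 -> {localhost2}, localhost2 -> {localhost1, localhost3}, localhost3 -> {}, and Kosaraju's algorithm should then report two strongly connected components, {localhost1, localhost2} and {localhost3}.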

The digraph, the reverse digraph, and the reverse digraph's depth-first preorder, postorder, and reverse postorder are printed to give a peek into the algorithm's call order.

import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.io.*;
import java.net.*;
import java.nio.file.*;

public class ParallelCrawler {
    // python -m SimpleHTTPServer 8080
    // local training directory http://localhost:8080/webdir/
    private static final String WEBDIR = "/Users/homenetwork/webdir";
    private static final String STARTURL = "http://localhost1:8080/webdir/1-v0.html";
    private static final Set<String> visited = new ConcurrentSkipListSet<>();
    private static final ConcurrentMap<String, Set<String>> digraph = new ConcurrentHashMap<>();
    private AtomicInteger limit = new AtomicInteger();
    private static final int SEARCHLIMIT = 1000;

    private static final Map<String, Set<String>> digraphCtr = new HashMap<>();
    private static StringBuilder sb = new StringBuilder();

    private static int LANSIZE = 200; // LAN size is bounded; the real internet is effectively infinite
    private static Random r = new Random();
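    // Build a random page graph: 10 sites, each with up to LANSIZE/10 pages,
    // a few intra-site links per page and an occasional cross-site link.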
    private static void randomLAN() {
        for (int i = 0; i < 10; i++) { // total number of sites
            int numV = r.nextInt(LANSIZE / 10); // number of pages in each site
            for (int j = 0; j < numV; j++) {
                Set<String> v = new HashSet<>();
                for (int k = 0; k < r.nextInt(5); k++) { // internal links per
                                                         // page
                    v.add(i + "-v" + r.nextInt(numV));
                }
                if (r.nextBoolean()) {
                    v.add(r.nextInt(9) + "-v" + r.nextInt(numV)); // outside
                                                                  // link
                }
                digraphCtr.put(i + "-v" + j, v);
            }
        }
    }

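    // Hand-built, deterministic 5-page graph spanning sites 1, 2 and 3;
    // useful for verifying the crawl result and the SCC output by hand.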
    private static void controledLAN() {
        digraphCtr.put("1-v0", Arrays.asList("1-v1", "2-v0").stream()
                .collect(Collectors.toSet()));
        digraphCtr.put("1-v1", Arrays.asList("1-v0", "1-v5", "1-v4").stream()
                .collect(Collectors.toSet()));
        digraphCtr.put("2-v0", Arrays.asList("1-v1", "2-v1", "2-v3").stream()
                .collect(Collectors.toSet()));
        digraphCtr.put("2-v1", Arrays.asList("1-v1", "2-v0", "3-v0").stream()
                .collect(Collectors.toSet()));
        digraphCtr.put("3-v0", Arrays.asList("3-v1").stream()
                .collect(Collectors.toSet()));
    }

    /*
     * /etc/hosts maps localhost0..localhost8 to 127.0.0.1. A node named "1-v0"
     * is page v0 on site 1, served as http://localhost1:8080/webdir/1-v0.html
     */
    private static void internetSim() throws IOException {
        controledLAN();
        //randomLAN();
        Set<Path> htmls = Files.walk(Paths.get(WEBDIR))
                .filter(i -> i.toString().endsWith("html"))
                .collect(Collectors.toSet());
        for (Path p : htmls) {
            Files.deleteIfExists(p);
        }
        for (String key : digraphCtr.keySet()) {
            sb.setLength(0);
            sb.append("<html>").append("page " + key).append("</br>");
            digraphCtr.get(key).forEach(
                    v -> sb.append("<a href=\"http://localhost" + v.charAt(0)
                            + ":8080/webdir/" + v + ".html\">" + "page " + v
                            + " link</a></br>"));
            sb.append("</html>");
            Files.write(Paths.get(WEBDIR + "/" + key + ".html"), sb.toString()
                    .getBytes());
        }
        System.out.println("generated " + digraphCtr.size()
                + " html files under directory " + WEBDIR);

        try {
            System.out.println("give 2 seconds for stablize");
            Thread.sleep(2000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    public static void main(String... args) {
        try {
            internetSim();
            Set<URL> urls = new HashSet<>();
            urls.add(new URL(STARTURL));
            ParallelCrawler pc = new ParallelCrawler();
            pc.crawUrls(urls);
        } catch (IOException e) {
            e.printStackTrace();
        }

        Set<String> totalBagNodes = digraphCtr.values().stream()
                .flatMap(i -> i.stream()).distinct()
                .collect(Collectors.toSet());
        Set<String> totalVertexNodes = digraphCtr.keySet().stream().distinct()
                .collect(Collectors.toSet());
        totalVertexNodes.addAll(totalBagNodes);
        System.out.println("visited " + visited.size() + " nodes out of total "
                + totalVertexNodes.size());
        System.out.println("digraph: " + digraph);
        
        KosarajuSCC scc = new KosarajuSCC(digraph);
        System.out.println("strongly connected components: " + scc.getIds());
    }

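    // Recursively fan out over the extracted links in parallel. The atomic
    // counter caps the recursion at SEARCHLIMIT calls, roughly one call per
    // newly visited page.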
    private void crawUrls(Set<URL> urls) {
        if (limit.incrementAndGet() >= SEARCHLIMIT)
            return;
        urls.parallelStream().forEach(url -> {
            Set<URL> adj = getURLs(url);
            if (adj != null)
                crawUrls(adj);
        });
    }

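    // Fetch one page, extract its outgoing links, and record a host-level edge
    // from this page's host to every distinct foreign host it links to.
    // Returns null if the page has already been visited.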
    private Set<URL> getURLs(URL url) {
        // add() returns false if another thread has already claimed this URL
        if (!visited.add(url.toString()))
            return null;
        Set<URL> urls = new HashSet<>();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                url.openStream()));) {
            Set<String> tmp = br.lines().filter(i -> i.contains("href"))
                    .map(i -> HTMLLinkExtractor2.grabHTMLLinks(i))
                    .flatMap(i -> i.stream()).collect(Collectors.toSet());

            for (String i : tmp) {
                System.out.println(i);
                urls.add(new URL(i));
            }
            digraph.merge(
                    url.getHost(),
                    urls.stream().map(URL::getHost)
                            .filter(i -> !i.equals(url.getHost()))
                            .collect(Collectors.toSet()), (ov, nv) -> {
                        ov.addAll(nv);
                        return ov;
                    });
        } catch (FileNotFoundException e) {
            // do nothing
        } catch (IOException e) {
            e.printStackTrace();
        }
        return urls;
    }
}

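// Kosaraju's algorithm: compute the reverse postorder of the reverse digraph,
// then run DFS over the original digraph in that order; every DFS tree started
// from an unmarked vertex is one strongly connected component.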
class KosarajuSCC {
    private Set<String> marked = new HashSet<>(); // reached vertices
    private Map<String, Integer> ids = new HashMap<>(); // component identifiers
    private int count; // number of strong components

    public KosarajuSCC(Map<String, Set<String>> G) {
        System.out.println("reverse digraph: " + reverse(G));
        DepthFirstOrder order = new DepthFirstOrder(reverse(G));
        String s = null;
        System.out.println("\npre order");
        while((s = order.pre().poll()) != null) {
            System.out.print(s + " -> ");
        }
        System.out.println("\npost order");
        while((s = order.post().poll()) != null) {
            System.out.print(s + " -> ");
        }
        System.out.println("\nreversePost order");
        Stack<String> stack = order.reversePost();
        while(!stack.isEmpty()) {
            s = stack.pop();
            System.out.print(s + " -> ");
            if (!marked.contains(s)) {
                dfs(G, s);
                count++;
            }
        }
        System.out.println();
    }
    
    private Map<String, Set<String>> reverse(Map<String, Set<String>> g) {
        Map<String, Set<String>> reverse = new HashMap<>();
        for (String k : g.keySet()) {
            // keep every source vertex in the reverse graph even if nothing
            // links back to it, so DepthFirstOrder visits all vertices
            reverse.putIfAbsent(k, new HashSet<>());
            for (String v : g.get(k)) {
                reverse.putIfAbsent(v, new HashSet<>());
                reverse.get(v).add(k);
            }
        }
        return reverse;
    }

    private void dfs(Map<String, Set<String>> G, String v) {
        marked.add(v);
        ids.put(v, count);
        if(G.get(v) == null) return;
        for (String w : G.get(v))
            if (!marked.contains(w))
                dfs(G, w);
    }

    public boolean stronglyConnected(String v, String w) {
        return Objects.equals(ids.get(v), ids.get(w));
    }

    public Map<String, Integer> getIds() {
        return ids;
    }

    public int count() {
        return count;
    }
}

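// Records DFS preorder, postorder and reverse postorder of the given digraph
// (here, the reverse of the crawled host digraph).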
class DepthFirstOrder {
    private Set<String> marked = new HashSet<>();
    private Queue<String> pre;
    private Queue<String> post;
    private Stack<String> reversePost; // vertices in reverse postorder

    public DepthFirstOrder(Map<String, Set<String>> revg) {
        pre = new ArrayDeque<String>();
        post = new ArrayDeque<String>();
        reversePost = new Stack<String>();
        for (String key : revg.keySet()) {
            if (!marked.contains(key))
                dfs(revg, key);
        }
    }

    private void dfs(Map<String, Set<String>> tmpg2, String key) {
        pre.offer(key);
        marked.add(key);
        if(tmpg2.get(key) != null) {
            for (String w : tmpg2.get(key))
                if (!marked.contains(w))
                    dfs(tmpg2, w);
        }
        
        post.offer(key);
        reversePost.push(key);
    }

    public Queue<String> pre() {
        return pre;
    }

    public Queue<String> post() {
        return post;
    }

    public Stack<String> reversePost() {
        return reversePost;
    }
}

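// Extracts href targets from <a> tags with regular expressions; good enough
// for the generated pages, not a general HTML parser.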
class HTMLLinkExtractor2 {
    private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";
    private static final String HTML_A_HREF_TAG_PATTERN = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";

    public static Set<String> grabHTMLLinks(final String html) {
        Matcher matcherLink;
        Pattern patternTag = Pattern.compile(HTML_A_TAG_PATTERN);
        Pattern patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN);
        Matcher matcherTag = patternTag.matcher(html);
        Set<String> result = new HashSet<String>();
        while (matcherTag.find()) {
            String href = matcherTag.group(1); // attribute list of the <a> tag
            matcherLink = patternLink.matcher(href);
            while (matcherLink.find()) {
                String link = matcherLink.group(1); // link
                link = link.replaceAll("'", "");
                link = link.replaceAll("\"", "");
                result.add(link);
            }
        }
        return result;
    }

}
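
As a quick, standalone test of the SCC code (a hypothetical sketch, not part of the program above), KosarajuSCC can be fed a hand-built host digraph directly, skipping the crawl entirely; for the controlled LAN the expected answer is two components:

import java.util.*;

public class SCCSmokeTest {
    public static void main(String... args) {
        // the host digraph the controlled LAN is expected to produce
        Map<String, Set<String>> hosts = new HashMap<>();
        hosts.put("localhost1", new HashSet<>(Arrays.asList("localhost2")));
        hosts.put("localhost2", new HashSet<>(Arrays.asList("localhost1", "localhost3")));
        hosts.put("localhost3", new HashSet<>());
        KosarajuSCC scc = new KosarajuSCC(hosts); // prints its DFS orders
        // expect two components: {localhost1, localhost2} and {localhost3}
        System.out.println("component ids: " + scc.getIds());
        System.out.println("component count: " + scc.count());
    }
}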

This parallel crawler only works on a LAN with a limited number of hosts; on the internet, where the number of hosts is effectively unbounded, the crawl would never finish.