import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.*;
import java.util.concurrent.*;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* WebCrawler
* <p/>
* Using TrackingExecutorService to save unfinished tasks for later execution
* The multi-threaded crawler is using BFS
*/
public abstract class BFSWebCrawler {
private static final long SEARCH_TIME = 1000;
public static final String SEARCH = "Honorificabilitudinitatibus";
public static final String startpage = "http://xyzcode.blogspot.com/2017/08/test-crawler.html";
private static final Predicate<String> urlPred = i -> i.contains("html")
&& i.contains("xyzcode");
private volatile TrackingExecutor exec;
private final Set<String> urlsToCrawl = new ConcurrentSkipListSet<>();
private final ConcurrentMap<String, Boolean> seen = new ConcurrentHashMap<>();
private final static ConcurrentMap<String, Integer> found = new ConcurrentHashMap<>();
private final ConcurrentMap<String, Integer> problem = new ConcurrentHashMap<>();
private static final long TIMEOUT = 10;
private static final TimeUnit UNIT = TimeUnit.SECONDS;
private volatile static long Delay = 100;
private static final long LIMIT = 5000; //max delay time
public static void main(String... args) {
try {
BFSWebCrawler crawler = new BFSWebCrawler(startpage) {
@Override
protected Set<String> processPage(String url) {
if (url != null) {
Set<String> urls = getContainedUrls(url);
return urls;
}
return new HashSet<String>();
}
};
System.out.println("Will finish searching " + SEARCH
+ " start from " + startpage + " in: " + SEARCH_TIME / 1000
+ " seconds");
crawler.start();
Thread.sleep(SEARCH_TIME);
crawler.stop();
System.out.println("url visited: " + crawler.getSeenURLs().size());
System.out.println("occurance of "+ SEARCH + ": "+ found.values().stream()
.mapToInt(i -> i.intValue()).sum());
System.out.println(found.keySet());
} catch (Exception e) {
if (e instanceof InterruptedException)
Thread.currentThread().interrupt();
}
}
//equivalent to graph API: Bag<Vertex> adjacentTo(Vertex v);
private static Set<String> getContainedUrls(String url) {
String line;
Set<String> urls = new HashSet<>();
try (BufferedReader br = new BufferedReader(new InputStreamReader(new URL(url).openStream())) ){
while ((line = br.readLine()) != null) {
Matcher m = Pattern.compile(SEARCH).matcher(line); //waste memory, yes, but thread safe
while (m.find())
found.merge(url, 1, (v1, v2) -> v1 = v1 + 1);
urls.addAll(new HTMLLinkExtractor().grabHTMLLinks(line)
.stream().filter(urlPred).collect(Collectors.toSet()));
}
} catch (IOException ioe) {
if (ioe.getMessage().contains(
"Server returned HTTP response code: 503")) {
Delay = Delay < LIMIT ? Delay * 2 : LIMIT;
try {
Thread.sleep(Delay);
} catch (InterruptedException e) {
}
}
}
return urls;
}
public BFSWebCrawler(String startUrl) {
urlsToCrawl.add(startUrl);
}
public final Set<String> getSeenURLs() {
return this.seen.keySet();
}
public final Map<String, Integer> getProblemURLs() {
return this.problem;
}
public synchronized void start() {
exec = new TrackingExecutor(Executors.newCachedThreadPool());
for (String url : urlsToCrawl)
submitCrawlTask(url.toString());
urlsToCrawl.clear();
}
public synchronized void stop() throws InterruptedException {
try {
saveUncrawled(exec.shutdownNow());
if (exec.awaitTermination(TIMEOUT, UNIT))
saveUncrawled(exec.getCancelledTasks());
} catch (Exception e) {
e.printStackTrace();
} finally {
// exec = null;
}
}
protected abstract Set<String> processPage(String url);
private void saveUncrawled(List<Runnable> uncrawled) {
for (Runnable task : uncrawled)
urlsToCrawl.add(((CrawlTask) task).getPage());
}
private void submitCrawlTask(String u) {
try {
Thread.sleep(Delay);
exec.execute(new CrawlTask(u));
} catch (Exception e) {
if (e instanceof InterruptedException)
Thread.currentThread().interrupt();
}
}
private class CrawlTask implements Runnable {
private final String url;
CrawlTask(String url) {
this.url = url;
}
boolean alreadyCrawled() {
return seen.putIfAbsent(url, true) != null;
}
void markUncrawled() {
seen.remove(url);
}
public void run() {
if (alreadyCrawled()) {
return;
}
Set<String> list = processPage(url);
if (url.toString().contains("xyzcode")
&& url.toString().contains("html") && list.size() == 0) {
markUncrawled();
problem.merge(url.toString(), 1, (v1, v2) -> v1 = v1 + 1);
return;
}
for (String link : list) {
if (Thread.currentThread().isInterrupted())
return;
//BFS search
submitCrawlTask(link);
}
}
public String getPage() {
return url;
}
}
}
class TrackingExecutor extends AbstractExecutorService {
private final ExecutorService exec;
private final Set<Runnable> tasksCancelledAtShutdown = Collections
.synchronizedSet(new HashSet<Runnable>());
public TrackingExecutor(ExecutorService exec) {
this.exec = exec;
}
public void shutdown() {
exec.shutdown();
}
public List<Runnable> shutdownNow() {
return exec.shutdownNow();
}
public boolean isShutdown() {
return exec.isShutdown();
}
public boolean isTerminated() {
return exec.isTerminated();
}
public boolean awaitTermination(long timeout, TimeUnit unit)
throws InterruptedException {
return exec.awaitTermination(timeout, unit);
}
public List<Runnable> getCancelledTasks() {
if (!exec.isTerminated())
throw new IllegalStateException(/* ... */);
return new ArrayList<Runnable>(tasksCancelledAtShutdown);
}
public void execute(final Runnable runnable) {
exec.execute(new Runnable() {
public void run() {
try {
runnable.run();
} finally {
if (isShutdown() && Thread.currentThread().isInterrupted())
tasksCancelledAtShutdown.add(runnable);
}
}
});
}
}
class HTMLLinkExtractor {
private Pattern patternTag, patternLink;
private Matcher matcherTag, matcherLink;
private static final String HTML_A_TAG_PATTERN = "(?i)<a([^>]+)>(.+?)</a>";
private static final String HTML_A_HREF_TAG_PATTERN = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
public HTMLLinkExtractor() {
patternTag = Pattern.compile(HTML_A_TAG_PATTERN);
patternLink = Pattern.compile(HTML_A_HREF_TAG_PATTERN);
}
/**
* Extract href String with regular expression
*
*/
public Set<String> grabHTMLLinks(final String html) {
Set<String> result = new HashSet<String>();
matcherTag = patternTag.matcher(html);
while (matcherTag.find()) {
String href = matcherTag.group(1); // href
matcherLink = patternLink.matcher(href);
while (matcherLink.find()) {
String link = matcherLink.group(1); // link
link = link.replaceAll("'", "");
link = link.replaceAll("\"", "");
result.add(link);
}
}
return result;
}
}