A very simple web crawler program, but you can use it as a base and extend it into a more powerful crawler!
Thanks to the fellow blogger who provided the program!
/**
* @author jack.wang
*
*/
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// A web crawler that searches pages for a given string
public class SearchCrawler implements Runnable {
	/*
	 * disallowListCache caches the URLs that robots are not allowed to crawl.
	 * The robots exclusion protocol places a robots.txt file in the root
	 * directory of a web site, specifying which pages on the site are off
	 * limits to crawlers; a crawler should skip those areas.
	 * A sample robots.txt file:
	 *
	 * # robots.txt for http://somehost.com/
	 * User-agent: *
	 * Disallow: /cgi-bin/
	 * Disallow: /registration # disallow robots on registration page
	 * Disallow: /login
	 */
	private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();

	ArrayList<String> errorList = new ArrayList<String>(); // error messages
	ArrayList<String> result = new ArrayList<String>(); // search results
	String startUrl; // starting point of the search
	int maxUrl; // maximum number of URLs to process
	String searchString; // the string to search for (English)
	boolean caseSensitive = false; // whether the search is case sensitive
	boolean limitHost = false; // whether to restrict the search to the starting host
	public SearchCrawler(String startUrl, int maxUrl, String searchString) {
		this.startUrl = startUrl;
		this.maxUrl = maxUrl;
		this.searchString = searchString;
	}

	public ArrayList<String> getResult() {
		return result;
	}

	public void run() { // start the search thread
		crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
	}
	// Verify the URL format.
	private URL verifyUrl(String url) {
		// Only process HTTP URLs.
		if (!url.toLowerCase().startsWith("http://"))
			return null;
		URL verifiedUrl = null;
		try {
			verifiedUrl = new URL(url);
		} catch (Exception e) {
			return null;
		}
		return verifiedUrl;
	}
	// Check whether robots are allowed to access the given URL.
	private boolean isRobotAllowed(URL urlToCheck) {
		String host = urlToCheck.getHost().toLowerCase(); // host of the given URL
		// System.out.println("host=" + host);

		// Get the cached list of disallowed paths for this host.
		ArrayList<String> disallowList = disallowListCache.get(host);

		// If it is not cached yet, download and cache it.
		if (disallowList == null) {
			disallowList = new ArrayList<String>();
			try {
				URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
				BufferedReader reader = new BufferedReader(
						new InputStreamReader(robotsFileUrl.openStream()));

				// Read the robots.txt file and build the list of disallowed paths.
				String line;
				while ((line = reader.readLine()) != null) {
					if (line.indexOf("Disallow:") == 0) { // does the line start with "Disallow:"?
						String disallowPath = line.substring("Disallow:"
								.length()); // get the disallowed path

						// Strip any trailing comment.
						int commentIndex = disallowPath.indexOf("#");
						if (commentIndex != -1) {
							disallowPath = disallowPath.substring(0,
									commentIndex);
						}
						disallowPath = disallowPath.trim();
						disallowList.add(disallowPath);
					}
				}

				// Cache the disallowed paths for this host.
				disallowListCache.put(host, disallowList);
			} catch (Exception e) {
				return true; // no robots.txt in the site root, so access is allowed
			}
		}

		String file = urlToCheck.getFile();
		// System.out.println("file getFile()=" + file);
		for (int i = 0; i < disallowList.size(); i++) {
			String disallow = disallowList.get(i);
			if (file.startsWith(disallow)) {
				return false;
			}
		}
		return true;
	}
	// Download the contents of a page.
	private String downloadPage(URL pageUrl) {
		try {
			// Open a connection to the URL for reading.
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					pageUrl.openStream()));

			// Read the page into a buffer.
			String line;
			StringBuffer pageBuffer = new StringBuffer();
			while ((line = reader.readLine()) != null) {
				pageBuffer.append(line);
			}
			return pageBuffer.toString();
		} catch (Exception e) {
		}
		return null;
	}
	// Remove "www" from a URL.
	private String removeWwwFromUrl(String url) {
		int index = url.indexOf("://www.");
		if (index != -1) {
			return url.substring(0, index + 3) + url.substring(index + 7);
		}
		return (url);
	}
	// Parse a page and extract its links.
	private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
			HashSet<String> crawledList, boolean limitHost) {
		// Compile a regular expression that matches anchor links.
		Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
				Pattern.CASE_INSENSITIVE);
		Matcher m = p.matcher(pageContents);

		ArrayList<String> linkList = new ArrayList<String>();
		while (m.find()) {
			String link = m.group(1).trim();
			if (link.length() < 1) {
				continue;
			}

			// Skip links that point back into the same page.
			if (link.charAt(0) == '#') {
				continue;
			}
			if (link.indexOf("mailto:") != -1) {
				continue;
			}
			if (link.toLowerCase().indexOf("javascript") != -1) {
				continue;
			}

			if (link.indexOf("://") == -1) {
				if (link.charAt(0) == '/') { // handle host-absolute paths
					link = "http://" + pageUrl.getHost() + ":"
							+ pageUrl.getPort() + link;
				} else {
					String file = pageUrl.getFile();
					if (file.indexOf('/') == -1) { // handle relative paths
						link = "http://" + pageUrl.getHost() + ":"
								+ pageUrl.getPort() + "/" + link;
					} else {
						String path = file.substring(0,
								file.lastIndexOf('/') + 1);
						link = "http://" + pageUrl.getHost() + ":"
								+ pageUrl.getPort() + path + link;
					}
				}
			}

			int index = link.indexOf('#');
			if (index != -1) {
				link = link.substring(0, index);
			}

			link = removeWwwFromUrl(link);
			URL verifiedLink = verifyUrl(link);
			if (verifiedLink == null) {
				continue;
			}

			/* If the search is limited to one host, skip URLs on other hosts. */
			if (limitHost
					&& !pageUrl.getHost().toLowerCase().equals(
							verifiedLink.getHost().toLowerCase())) {
				continue;
			}

			// Skip links that have already been crawled.
			if (crawledList.contains(link)) {
				continue;
			}

			linkList.add(link);
		}
		return (linkList);
	}
	// Check whether the downloaded page contains the search string.
	private boolean searchStringMatches(String pageContents,
			String searchString, boolean caseSensitive) {
		String searchContents = pageContents;
		if (!caseSensitive) { // if the search is not case sensitive
			searchContents = pageContents.toLowerCase();
		}

		// Split the search string into individual terms on whitespace.
		Pattern p = Pattern.compile("[\\s]+");
		String[] terms = p.split(searchString);
		for (int i = 0; i < terms.length; i++) {
			if (caseSensitive) {
				if (searchContents.indexOf(terms[i]) == -1) {
					return false;
				}
			} else {
				if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
					return false;
				}
			}
		}
		return true;
	}
	// Perform the actual search.
	public ArrayList<String> crawl(String startUrl, int maxUrls,
			String searchString, boolean limitHost, boolean caseSensitive) {
		HashSet<String> crawledList = new HashSet<String>();
		LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

		if (maxUrls < 1) {
			errorList.add("Invalid Max URLs value.");
			System.out.println("Invalid Max URLs value.");
		}
		if (searchString.length() < 1) {
			errorList.add("Missing Search String.");
			System.out.println("Missing Search String.");
		}
		if (errorList.size() > 0) {
			System.out.println("err!!!");
			return errorList;
		}

		// Remove "www" from the start URL.
		startUrl = removeWwwFromUrl(startUrl);
		toCrawlList.add(startUrl);

		while (toCrawlList.size() > 0) {
			if (maxUrls != -1) {
				if (crawledList.size() == maxUrls) {
					break;
				}
			}

			// Get the URL at the head of the to-crawl list.
			String url = toCrawlList.iterator().next();

			// Remove the URL from the to-crawl list.
			toCrawlList.remove(url);

			// Convert the string URL to a URL object.
			URL verifiedUrl = verifyUrl(url);

			// Skip the URL if robots are not allowed to access it.
			if (!isRobotAllowed(verifiedUrl)) {
				continue;
			}

			// Add the processed URL to crawledList.
			crawledList.add(url);
			String pageContents = downloadPage(verifiedUrl);

			if (pageContents != null && pageContents.length() > 0) {
				// Extract the valid links from the page.
				ArrayList<String> links = retrieveLinks(verifiedUrl,
						pageContents, crawledList, limitHost);
				toCrawlList.addAll(links);

				if (searchStringMatches(pageContents, searchString,
						caseSensitive)) {
					result.add(url);
					System.out.println(url);
				}
			}
		}
		return result;
	}
	// Main method.
	public static void main(String[] args) {
		SearchCrawler crawler = new SearchCrawler(
				"http://www.blogjava.net/jack2007/", 20, "jack");
		Thread search = new Thread(crawler);
		System.out.println("Start searching...");
		System.out.println("result:");
		search.start();
		try {
			search.join();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
	}
}
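
The main method above only prints matches as they are found. If you would rather collect the matched URLs after the crawl finishes, a minimal sketch is to join the crawler thread and then read getResult(). The class name CrawlerDemo, the 10-page limit, and the "java" search term below are just placeholders for illustration, not part of the original program:

public class CrawlerDemo {
	public static void main(String[] args) throws InterruptedException {
		// Crawl at most 10 pages starting from the blog URL, searching for "java".
		SearchCrawler crawler = new SearchCrawler(
				"http://www.blogjava.net/jack2007/", 10, "java");
		Thread worker = new Thread(crawler);
		worker.start();
		worker.join(); // wait until the crawl loop is done

		// getResult() returns the URLs whose pages contained every search term.
		for (String url : crawler.getResult()) {
			System.out.println("match: " + url);
		}
	}
}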
This blog is for learning and exchange. Unless a source is cited, all posts are my own work; please credit the source when reposting, and notify me promptly of any copyright issues. The posts were written in a hurry, so please forgive any mistakes. Feel free to leave me a message with any comments, and let's learn and improve together.