|
用Java+MySQL+PHP轻松构建跨平台的搜索引擎
发表日期:2008-1-5
|
此搜索引擎适于在一个中等规模的局域网中使用,由于找到的网页存在数据库中,不仅可以搜索静态的Html页面,还可以搜索PHP、ASP等动态页面。对于一个拥有5万个网页的系统(使用PII-400作为服务器),搜索响应时间在2-10秒左右,完全可以满足要求,由于Java、mysql、PHP都是跨平台的软件,所以此搜索引擎不仅可以工作在Windows服务器上,而且也可以工作在Linux等其他系统中。
一、建立搜索引擎需要的数据库和数据表。
首先建立数据库:
c:\mysql\bin\> mysqladmin -uroot -pmypassword create Spider
然后建立数据库中的表结构
c:\mysql\bin\> mysql -uroot -pmypassword Spider < Spider.mysql
其中Spider.mysql为一个文本文件,其内容如下:
# Table link is the crawl frontier: one row per discovered URL.
#   Id           - auto-increment number, kept unique
#   Url          - absolute page URL; it is the primary key, so a URL can be stored only once
#   Class        - level number the crawler assigns to the link (the seed row uses 0)
#   IsSearchLink - 0 until the crawler has scanned the page for links, 1 afterwards
# PRIMARY KEY (Url) already creates an index on Url, so no additional KEY on Url is declared.
CREATE TABLE link (
    Id int(10) unsigned NOT NULL auto_increment,
    Url varchar(120) NOT NULL,
    Class tinyint(3) unsigned NOT NULL default 0,
    IsSearchLink tinyint(3) unsigned default 0,
    PRIMARY KEY (Url),
    UNIQUE Id (Id),
    KEY Class (Class)
);
# Seed row: the home page of the local network. The spider starts here and
# follows links to reach every other page.
# Columns are named explicitly so the statement keeps working if the table
# definition gains or reorders columns; numeric columns get numeric literals.
INSERT INTO link (Id, Url, Class, IsSearchLink)
VALUES (1, 'HTTP://102.211.69.1/', 0, 0);
# Table webpagelocal stores every downloaded web page.
# Content is MEDIUMTEXT: the crawler accepts pages of up to 200000 bytes, which
# does not fit in a TEXT column (65535 bytes).
# PRIMARY KEY (Url) already creates an index on Url, so no additional KEY on Url is declared.
CREATE TABLE webpagelocal (
    Id int(10) unsigned NOT NULL auto_increment,
    Url varchar(120) NOT NULL,
    Content mediumtext NOT NULL,
    PRIMARY KEY (Url),
    UNIQUE Id (Id)
);
# Table webpagefindfast
# MakeFast.php extracts 512 bytes of searchable text per page from table
# webpagelocal and stores it here.
# PRIMARY KEY (Url) already creates an index on Url, so no additional KEY on Url is declared.
CREATE TABLE webpagefindfast (
    Id int(10) unsigned NOT NULL,
    Url varchar(120) NOT NULL,
    Title varchar(64),
    Content blob,
    PRIMARY KEY (Url),
    KEY Title (Title)
);
二、以下为搜索网页和下载网页至本地数据库的Java程序LinkToDB.java,它也是此搜索引擎的核心和基础
/***************************** LinkToDB.java ***********************************
 *
 * Analyses the http links found at a URL, converts relative paths to absolute
 * paths, and writes the result to the database in sorted order.
 *
 * When a URL found this way is unique in the link table, its content is
 * downloaded into the WebPageLocal table.
 *
 ********************************************************************************/
import java.io.*;
import java.util.*;
import java.net.*;
import java.lang.String;
import java.sql.*;
import java.text.*;
/** Occurrence counter that starts at one, so a new instance already counts the sighting that created it. */
class Counter {
    private int count = 1;

    /** Returns the current count. */
    int read() {
        return this.count;
    }

    /** Adds one to the count. */
    void increment() {
        this.count += 1;
    }
}
public class LinkToDB { String UrlHost = ""; String UrlFile = ""; String UrlPath = ""; static String StartWith = null; boolean outsideTag = true; //判定是否在标记之中 static char[] buffer = new char[4096]; // 缓冲区:用于保存从 URL 读的数据 InputStreamReader read = null; BufferedReader reader = null; URLConnection UC = null; private URL url = null; private StreamTokenizer st; private TreeMap counts = new TreeMap();//以排序方式保存找到的链接
LinkToDB(String myurl,String StartOnly){ try { StartWith = StartOnly; if(StartOnly!=null) { if(!myurl.startsWith(StartOnly)) return; }//只搜索此网站 url = new URL(myurl); UrlHost = url.getHost(); UrlHost = UrlHost.toUpperCase(); UrlFile = url.getFile(); int v=UrlFile.lastIndexOf("/"); if(v!=-1) UrlPath = UrlFile.substring(0,v); System.out.println("分析文件:"+myurl); int uclength=200000; int ucError=0; try{ uc = url.openConnection(); uc.setUseCaches(false); uc.connect(); } catch(IOException io) { ucError=1; System.out.println("打不开待分析网页:"+myu rl); } if(ucError!=1){ uclength = uc.getContentLength(); if (uclength<200000) { try{ read = new InputStreamReader(url.openStream()); } catch(IOException io) {System.out.println("流打开错误:"+myurl);} } else System.out.println("文件太大,不分析"); } if(read!=null){ reader=new BufferedReader(read); if(reader!=null){ st = new StreamTokenizer(reader); st.resetSyntax(); // 重置语法表 st.WordChars(0,255); // 令牌范围为全部字符 st.ordinaryChar('<'); // HTML标记两边的分割符 st.ordinaryChar('>'); } } } catch(MalformedURLException e){ System.out.println("Malformed URL String!");} } void cleanup() { try { read.close(); } catch(IOException e) { System.out.println("流关闭错误"); } } void countWords() { try { while(st.nextToken()!=StreamTokenizer.TT_EOF) { String s0=""; String s_NoCase=""; switch(st.ttype) { case '<': //入标记字段 outsideTag=false; continue; //countWords(); case '>': //出标记字段 outsideTag=true; continue; //countWords(); case StreamTokenizer.TT_EOL: s0 = new String("EOL"); break; case StreamTokenizer.TT_WORD: if(!outsideTag) s0 = st.sval; /*已经是字符 串*/ break; default: s0 = "";// s0 = String.valueOf((char)st.ttype);/*单一字符*/ } if(outsideTag) continue;//出了标记区域(<a >) String s = ""; s_NoCase = s0.trim(); s0=s_NoCase.toUpperCase(); if(s0.startsWith("A ")s0.startsWith("AREA ")s0.startsWith("FRAME ")s0.s tartsWith("IFRAME ")){ //以这些开始的都是超级链接 int HREF_POS = -1; if(s0.startsWith("FRAME ")s0.startsWith("IFRAME ")) { HREF_POS = s0.indexOf("SRC="); s0 = s0.substring(HREF_POS+4).trim(); 
s_NoCase=s_NoCase.substring(HREF_POS+4).trim(); } else { HREF_POS=s0.indexOf("HREF="); s0=s0.substring(HREF_POS+5).trim(); s_NoCase=s_NoCase.substring(HREF_POS+5).trim(); } if(HREF_POS!=-1) { if(s0.startsWith("\"")) {s0=s0.substring(1);s_NoCase=s_NoCase.substring(1);} int QUOTE=s0.indexOf("\""); if(QUOTE!=-1) {s0=s0.substring(0,QUOTE).trim();s_NoCase=s_NoCase.substring(0,QUOTE).trim ();} int SPACE=s0.indexOf(" "); if(SPACE!=-1) {s0=s0.substring(0,SPACE).trim();s_NoCase=s_NoCase.substring(0,SPACE).trim ();} if(s0.endsWith("\"")) {s0=s0.substring(0,s0.length()-1);s_NoCase=s_NoCase.substring(0,s_NoCase.l ength()-1);} if(s0.indexOf("'")!=-1s0.indexOf("javascript:")!=-1s0.indexOf("..")!=-1 ) {s0="";s_NoCase="";} //有这些符号,认为非合法链接;两点表示上一目录,而我 只想向下级查找 if ( !s0.startsWith("FTP://") &&//以下后缀或前缀通常非网页格式 !s0.startsWith("FTP://") && !s0.startsWith("MAILTO:") && !s0.endsWith(".SWF") && !s0.startsWith("../")) //因../表示上一目录,通常只需考虑本级和下N级目录 s=s0; if (!s.startsWith("HTTP://")&&!s.equals("")) {s=UrlHost+UrlPath+"/"+s;s_No Case=UrlHost+UrlPath+"/"+s_NoCase;} else if(s.startsWith("/")) {s=UrlHost+s;s_NoCase=UrlHost+s_NoCase;} if(s.startsWith("HTTP://")) {s=s.substring(7);s_NoCase=s_NoCase.substring( 7);} int JinHao=s.indexOf("#"); //假如含有"#"号,表示有效的链接是此前的部分 if(JinHao!=-1) {s=s.substring(0,JinHao).trim();s_NoCase=s_NoCase.substring( 0,JinHao).trim();} int H=-1; //以下将/./转换为/ for(int m=0;m<4;m++){ H=s.indexOf("/./"); if(H!=-1) {s=s.substring(0,H)+s.substring(H+2);s_NoCase=s_NoCase.substring (0,H)+s_NoCase.substring(H+2);} } int TwoXG=-1; //以下将//转换为/ for(int m=0;m<5;m++){ TwoXG=s.indexOf("//"); if(TwoXG!=-1) {s=s.substring(0,TwoXG)+s.substring(TwoXG+1);s_NoCase=s_NoCa se.substring(0,TwoXG)+s_NoCase.substring(TwoXG+1);} } int OneXG=s.indexOf("/"); if(OneXG==-1) {s=s+"/";s_NoCase+="/";} //将xx.xx.xx.xxx转换为xx.xx.xx.xxx/的 标准形式 if (!s.startsWith("HTTP://")) {s="HTTP://"+s;s_NoCase="HTTP://"+s_NoCase;} } } if(counts.containsKey(s_NoCase)) ((Counter)counts.get(s_NoCase)).increment(); else 
counts.put(s_NoCase,new Counter()); } } catch(IOException e) { System.out.println("st.nextToken() unsuccessful"); } }
Collection values() { return counts.values(); } Set keySet() { return counts.keySet(); } Counter getCounter(String s) { return (Counter)counts.get(s); }
public static void main(String[] argv) throws FileNotFoundException { try{ Class.forName("org.gjt.mm.mysql.Driver").newInstance(); } catch (Exception E) { System.out.println("加载Jdbc驱动程序失败"); E.printStackTrace(); } try{ Connection conn = DriverManager.getConnection( "jdbc:mysql://localhost/Spider?user=root&password=mypassword"); Statement stmt = conn.createStatement(); String myurl; for(int i=1;i<=6;i++){ String query = "SELECT Url FROM link WHERE IsSearchLink=0 and Class="+(i-1)+" ORDER BY Url"; ResultSet rs = stmt.executeQuery(query); while (rs.next()) { myurl = rs.getString("Url"); String StartOnly = null; if(argv.length>0) StartOnly=argv[0]; LinkToDB wc = new LinkToDB(myurl,StartOnly); if(wc.reader!=null){ stmt.executeUpdate("UPDATE Link SET IsSearchLink=1 WHERE Class="+(i-1)+" a nd Url='"+myurl+"'"); wc.countWords(); Iterator keys = wc.keySet().iterator(); while(keys.hasNext()) { String key = (String)keys.next(); System.out.println("分析找到链接:"+key + ": "+ wc.getCounter(key).read()); int ErrorDB=0; if(StartWith==null(StartWith!=null&&key.startsWith(StartWith))){ try{//-------------------------- 找到的链接插入数据库link ----------------- -- stmt.executeUpdate("INSERT INTO Link(Id,Url,Class) VALUES(0,'"+key+"',"+ i+")"); } catch(SQLException ex){ ErrorDB=1; System.out.println("插入数据错 SQLException: " + ex.getMessage()) ; } if(ErrorDB!=1){ //链接不重复就下载网页到WebPageLocal System.out.println("下载网页:"+key); int length; // 读的字符数 int filelength=200000; InputStreamReader read=null; URL rurl=null; URLConnection urlc=null; String Content=""; try{ rurl = new URL(key); } catch(MalformedURLException mu) { System.out.println("打开下载网页出错:"+mu.getMessage()); } if(rurl!=null){ int ucError=0; try{ urlc = rurl.openConnection(); urlc.connect();} catch(IOException io) { ucError=1; System.out.println("下载网页打不开:"+ke y); } if(ucError==0){ try{ filelength=urlc.getContentLength(); if (filelength>=200000) System.out.println("网页太大,我不下载了。"+key); else read = new InputStreamReader(rurl.openStream()); } 
catch(IOException io) {System.out.println("下载网页打不开:"+key);} } // 读入 URL 并写入数据库 if(read!=null&&filelength<200000){ try{ while((length = read.read(buffer)) != -1) { String s = new String(buffer, 0, length); Content=Content+s; } } catch(IOException io) { Content=""; System.out.println("不能读入URL文件"); } try{ Statement stmt2 = conn.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE, Res ultSet.CONCUR_UPDATABLE); ResultSet uprs = stmt2.executeQuery("SELECT Id,Url,Content FROM WebPageLoca l WHERE 0");//WHERE 0很重要,否则会耗尽内存 uprs.moveToInsertRow(); uprs.updateInt("Id",0); uprs.updateString("Url",key); uprs.updateString("Content",Content); uprs.insertRow(); uprs.beforeFirst(); uprs.close(); stmt2.close(); } catch(SQLException ex){ System.out.println("插入数据错:" + ex.getMessage()); } }//if(read!=null&&filelength<200000) }// } } //------------------------------- 下载网页 ---------------------------------- }//while(keys.hasNext()) wc.cleanup(); }//if(wc.reader!=null) }//while rs.next rs.close(); //关闭记录结果 }//end for stmt.close(); //关闭语句 conn.close(); //关闭连接 }//try catch(SQLException ex){ System.out.println("SQL异常:" + ex.getMessage()); } }//main()函数结束 }//类 LinkToDB 结束
三、编译和运行此Java程序
d:\Spider\> set CLASSPATH=d:\j\mm.mysql.jdbc2;
d:\Spider\> d:\j\bin\javac LinkToDB.java
d:\Spider\> d:\j\bin\java LinkToDB
其中第一行命令是设置MySQL的JDBC驱动程序路径。
四、由于网页中含有大量的无用的格式信息,直接用它来搜索要浪费大量的时间,所以需要去掉其中的HTML格式控制信息,并将太长的网页截短,然后将整理后的用于搜索的信息存到另一个数据表中。由于PHP4中有一个很方便的函数strip_tags可以去掉其中的HTML格式标记,所以我们用PHP来整理。
MakeFast.php的内容如下:
<?php
// MakeFast.php - fills table webpagefindfast from table webpagelocal.
//
// For every stored page whose Id lies strictly between the request parameters
// n1 and n2, the script extracts the <TITLE> text and the first 511 bytes of
// the tag-free, space-free body text, and inserts them as one row.
//
// Usage: MakeFast.php?n1=1&n2=10000

// Read the range from the query string and force it to integers, so the
// values can be placed in the SQL text safely.
$n1 = isset($_GET['n1']) ? intval($_GET['n1']) : 0;
$n2 = isset($_GET['n2']) ? intval($_GET['n2']) : 0;

mysql_connect("localhost", "root", "mypassword") or die(mysql_error());
mysql_select_db("Spider") or die(mysql_error());

// Table names are written exactly as in the CREATE TABLE statements, because
// MySQL table names are case-sensitive on case-sensitive file systems.
$result = mysql_query("select Id,Url,Content from webpagelocal where Id>$n1 and Id<$n2")
    or die(mysql_error());

while ($mt = mysql_fetch_array($result)) {
    $Title = "";
    $Body = "";

    // Tag positions are searched in an upper-cased copy; text is cut from the original.
    $mt2 = strtoupper($mt[2]);
    $PosTitleL = strpos($mt2, "<TITLE>");
    $PosTitleR = strpos($mt2, "</TITLE>");
    $PosBody = strpos($mt2, "<BODY");
    $PosHeadR = strpos($mt2, "</HEAD>");

    // strpos() returns false when nothing is found and 0 for a match at the
    // very start, so the results are compared with false explicitly.
    if ($PosTitleL !== false && $PosTitleR !== false && $PosTitleR >= $PosTitleL + 7) {
        $Title = substr($mt[2], $PosTitleL + 7, $PosTitleR - $PosTitleL - 7);
    }
    $Title = str_replace("'", "’", $Title);

    // The body starts at the first of these markers that exists.
    if ($PosBody !== false) {
        $Body = substr($mt[2], $PosBody);
    } else if ($PosHeadR !== false) {
        $Body = substr($mt[2], $PosHeadR + 7);
    } else if ($PosTitleR !== false) {
        $Body = substr($mt[2], $PosTitleR + 8);
    } else if ($PosTitleL !== false) {
        $Body = substr($mt[2], $PosTitleL);
    } else {
        $Body = $mt[2];
    }

    $BodyText = strip_tags($Body);
    $BodyNoSpace = str_replace(" ", "", $BodyText);
    $BodyNoQuote = str_replace("'", "", $BodyNoSpace);
    // NOTE(review): a byte-based cut can split a double-byte character at the end - confirm this is acceptable.
    $Body512 = substr($BodyNoQuote, 0, 511) . " ";

    $Id = $mt[0];
    $Url = $mt[1];
    $sql = "Insert Into webpagefindfast(Id,Url,Title,Content) VALUES($Id,'"
        . mysql_real_escape_string($Url) . "','"
        . mysql_real_escape_string($Title) . "','"
        . mysql_real_escape_string($Body512) . "')";
    mysql_query($sql) or die($sql);
    echo $Id . " ";
}
?>
使用方式:
在浏览器中输入http://mywebsite/Spider/MakeFast.php?n1=1&n2=10000
五、以上是建立搜索引擎所用到的数据,下面编制用于用户搜索的网页和PHP脚本文件。首先是用于搜索的表单页面SearchForm.htm,其内容如下。
<!-- SearchForm.htm: search form. Posts the fields KeyWords, SearchIn and Speed to Search.php. -->
<html>
<head>
<title>红蜘蛛搜索引擎-V0.1</title>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<link rel="stylesheet" href="../All.Css" type="text/css">
</head>

<body bgcolor="#eeffee" text="#000000">
<!-- Page banner -->
<table width="600" border="0" cellspacing="2" cellpadding="2" align="center" bgcolor="#99CC00">
  <tr>
    <td>
      <div align="center"><font color="#FF0000"><b><span class="pt16">红蜘蛛搜索引擎</span>
        <span class="pt12">V0.1</span></b></font></div>
    </td>
  </tr>
</table>

<!-- Query form -->
<form name="form1" method="post" action="Search.php">
  <table width="600" border="1" cellspacing="1" cellpadding="1" align="center" bordercolor="#99CC00">
    <tr>
      <td>
        <div align="left"><span class="pt15"><font color="#FF0000"><b>关 键 字</b>:</font></span>
          <input type="text" name="KeyWords" size="40" maxlength="40">
        </div>
      </td>
      <td>
        <div align="left"><span class="pt15"><font color="#FF0000"><b>查找范围</b>:</font></span>
          <select name="SearchIn">
            <option value="Content" selected>网页正文</option>
            <option value="Title">网页标题</option>
          </select>
        </div>
      </td>
    </tr>
    <tr>
      <td colspan="2"><span class="pt15"><font color="#FF0000"><b>查找方式</b>:</font></span>
        <select name="Speed">
          <option value="Fast" selected>快速查找</option>
          <option value="Slow">更深查找</option>
        </select>
      </td>
    </tr>
    <tr>
      <td colspan="2">
        <div align="left">
          <input type="submit" name="Submit" value="搜索">
        </div>
      </td>
    </tr>
  </table>
</form>

<!-- Usage notes -->
<table width="600" border="0" cellspacing="2" cellpadding="2" align="center">
  <tr>
    <td height="18">
      <p class="pt12"><font color="#FF0000"><b>使用方法</b></font>:仅需在关键字一栏输入查询内容并按回车键(Enter)即可。</p>
      <p align="left" class="pt12">假如希望输入多个条件,只需要用空格分隔即可:</p>
      <p class="pt12">例如:要查询同时包含“西昌”和“卫星”的网页,只需输入[西昌 卫星]。</p>
      <p class="pt12">又如:要查询只包含关键字“西昌”而不包含“卫星”的网页,只需要输入[西昌 -卫星]。注意中间的空格不能少。</p>
      <p class="pt12"><font color="#FF0000"><b>查找范围</b></font>:你可以选择从“网页标题”中查找或者从“网页正文”中查找。</p>
      <p class="pt12"><font color="#FF0000"><b>查找方式</b></font>:“快速查找”速度快但找到的网页数可能较少,因为:</p>
      <p class="pt12">“快速查找”只搜索网页正文的前512个字符。</p>
      <p class="pt12">“更深查找”搜索网页正文的前2048个字符。</p>
      <div align="right"><a href="mailto:zdyhlp@263.net"><font color="#FF0000" class="pt13"><b>欢迎提出宝贵意见</b></font></a></div>
    </td>
  </tr>
</table>
</body>
</html>
Search.php根据用户输入的条件,完成搜索,显示找到的网页的链接地址、标题和提要信息。内容如下:
<?php
// Search.php - runs the search described by the submitted form and shows the
// address, title and summary of up to 20 matching pages per request.
//
// Request fields:
//   KeyWords - space-separated words; a leading "-" excludes pages containing the word
//   SearchIn - "Content" (default) or "Title": the column that is searched
//   Speed    - "Fast" (default) or "Slow": selects the table that is searched
//   CurPos   - optional Id at which the next result page starts

// Returns one request field as a string, or "" when it is missing.
function spider_param($name)
{
    if (!isset($_REQUEST[$name]) || !is_string($_REQUEST[$name])) {
        return "";
    }
    $value = $_REQUEST[$name];
    if (get_magic_quotes_gpc()) {
        $value = stripslashes($value); // values are escaped explicitly where they are used
    }
    return $value;
}

// Escapes text for use inside HTML text or a double-quoted attribute value.
// Works byte by byte on purpose: a charset-aware function can reject text that
// was cut in the middle of a double-byte character.
function spider_attr($text)
{
    return str_replace(
        array("&", "<", ">", "\"", "'"),
        array("&amp;", "&lt;", "&gt;", "&quot;", "&#039;"),
        $text
    );
}

// Neutralizes tag delimiters in stored page text; entities already present in
// the stored text (for example &nbsp;) are left as they are.
function spider_text($text)
{
    return str_replace(array("<", ">"), array("&lt;", "&gt;"), $text);
}

// Escapes a word for use inside a quoted SQL LIKE pattern.
function spider_like($word)
{
    return addcslashes(mysql_real_escape_string($word), "%_");
}

$KeyWords = spider_param("KeyWords");
// Column and table names cannot be escaped, so they are chosen from fixed values.
$SearchIn = (spider_param("SearchIn") == "Title") ? "Title" : "Content";
$Speed = (spider_param("Speed") == "Slow") ? "Slow" : "Fast";
$CurPos = spider_param("CurPos");
?>
<TITLE>红蜘蛛正在搜索关键词为[<?php echo spider_attr($KeyWords); ?>]的网页</TITLE>
<link rel="stylesheet" href="../All.css" type="text/css">
<body bgcolor="#eeffee">
<table width="96%" border="0" cellspacing="2" cellpadding="2" align="center" bgcolor="#99CC00">
  <tr>
    <td>
      <div align="center"><font color="#FF0000"><b><span class="pt16">红蜘蛛搜索引擎</span>
        <span class="pt12">V0.1</span></b></font></div>
    </td>
  </tr>
</table>
<form name="form1" method="post" action="Search.php">
  <table width="96%" border="1" cellspacing="1" cellpadding="1" align="center" bordercolor="#99CC00">
    <tr>
      <td valign="top">
        <font color="#FF0000"><b><span class="pt13">关键字</span></b><span class="pt13">:</span></font>
        <input type="text" name="KeyWords" value="<?php echo spider_attr($KeyWords); ?>" size="30" maxlength="30">
        <input type="submit" name="Submit" value="重新搜索">
      </td>
      <td valign="top"><font color="#FF0000"><b><span class="pt13">查找范围</span></b><span class="pt13">:</span></font>
        <select name="SearchIn">
          <option value="Content" <?php if ($SearchIn == "Content") echo "selected "; ?>>网页正文</option>
          <option value="Title" <?php if ($SearchIn == "Title") echo "selected "; ?>>网页标题</option>
        </select>
      </td>
      <td valign="top">
        <div align="left"><font color="#FF0000"><b><span class="pt13">查找方式</span></b><span class="pt13">:</span></font>
          <select name="Speed">
            <option value="Fast" <?php if ($Speed == "Fast") echo "selected"; ?>>快速查找</option>
            <option value="Slow" <?php if ($Speed == "Slow") echo "selected"; ?>>更深查找</option>
          </select>
        </div>
      </td>
    </tr>
  </table>
</form>
<?php
// Title searches always use the fast table; content searches use the table
// selected by Speed. The fast table name matches its CREATE TABLE statement.
// NOTE(review): no CREATE TABLE for the "slow" table is visible here; its name
// is assumed to follow the same lowercase convention - confirm.
if ($SearchIn == "Title") {
    $Table = "webpagefindfast";
} else {
    $Table = "webpagefind" . strtolower($Speed);
}

// NOTE(review): as written this replaces a space with a space; the first
// argument was presumably a full-width space originally - confirm.
$KeyWords = str_replace(" ", " ", $KeyWords);
if ($KeyWords == "") {
    echo "关键字不能为空";
    exit();
}

// The connection is opened before the query is built because
// mysql_real_escape_string() needs it.
mysql_connect("localhost", "root", "mypassword") or die(mysql_error());
mysql_select_db("Spider") or die(mysql_error());

$Words = array();      // positive words, highlighted in the summaries
$Conditions = array(); // one LIKE / NOT LIKE condition per word
$Parts = explode(" ", $KeyWords);
for ($p = 0; $p < count($Parts); $p++) {
    $key = $Parts[$p];
    if ($key === "") {
        continue; // repeated spaces
    }
    if (substr($key, 0, 1) != "-") {
        $Conditions[] = $SearchIn . " LIKE '%" . spider_like($key) . "%'";
        $Words[] = $key;
    } else {
        $excluded = (string) substr($key, 1);
        if ($excluded === "") {
            continue; // a lone "-" excludes nothing
        }
        $Conditions[] = $SearchIn . " NOT LIKE '%" . spider_like($excluded) . "%'";
    }
}
if (count($Conditions) == 0) {
    echo "关键字不能为空";
    exit();
}
if ($CurPos != "" && preg_match('/^[0-9]+$/', $CurPos)) {
    $Conditions[] = "Id>=" . $CurPos;
}

// ORDER BY Id makes "continue from Id CurPos" return the rows that follow the
// previous result page.
$SQL = "SELECT Id,Url,Title,Content FROM " . $Table . " WHERE "
    . implode(" AND ", $Conditions) . " ORDER BY Id LIMIT 100";
$result = mysql_query($SQL) or die(mysql_error());
$RowCount = mysql_num_rows($result);
$FindCount = 0;
$Pos = "";
?>
<table border=0 align=center width="96%">
  <tr>
    <th nowrap width="41%">
      <div align="left" class="pt12">共找到关键字为 <font color=red> <?php echo spider_attr($KeyWords); ?> </font>
        的网页共 <font color=red> <?php echo $RowCount; ?> </font> 个</div>
    </th>
    <td nowrap> </td>
  </tr>
  <tr bgcolor="#FF0000">
    <th nowrap colspan="2" height="3"></th>
  </tr>
<?php
while ($row = mysql_fetch_array($result)) {
    // $Pos is set before the limit check: when the loop stops at the 21st row,
    // $Pos holds the Id at which the next result page starts.
    $Pos = $row[0];
    $FindCount++;
    if ($FindCount > 20) {
        break;
    }
?>
  <tr>
    <td nowrap colspan="2"> <?php echo $FindCount; ?>
      <a href="<?php echo spider_attr($row[1]); ?>" target=_blank>
        <?php
        if ($row[2] != "") {
            echo spider_text($row[2]);
        } else {
            echo spider_text(substr($row[3], 0, 64));
        }
        ?>
      </a></td>
  </tr>
  <tr>
    <td colspan="2"><span class="pt13">摘要:</span>
<?php
    if ($SearchIn == "Title") {
        $ZhaiYao = spider_text(substr($row[3], 0, 1024));
    } else {
        if ($Speed == "Fast") {
            $ZhaiYao = $row[3];
        } else {
            $RowLen = strlen($row[3]);
            if ($RowLen < 1024) {
                $ZhaiYao = $row[3];
            } else {
                $CutPos = 0;
                $PosWord1 = (count($Words) > 0) ? strpos($row[3], $Words[0]) : false;
                if ($PosWord1 === false || $PosWord1 - 512 < 0) {
                    $ZhaiYao = substr($row[3], 0, 1024);
                } else {
                    // Start the excerpt at a single-byte character before the
                    // first word, so a double-byte character is not split.
                    for ($i = 24; $i < 500; $i++) {
                        if (ord(substr($row[3], $PosWord1 - $i, 1)) < 128) {
                            $CutPos = $i;
                            break;
                        }
                    }
                    $ZhaiYao = substr($row[3], $PosWord1 - $CutPos, 1024);
                }
            }
        }
        $ZhaiYao = spider_text($ZhaiYao);
        // Highlight every positive search word.
        for ($w = 0; $w < count($Words); $w++) {
            $Shown = spider_text($Words[$w]);
            $ZhaiYao = str_replace($Shown, "<font color=red>" . $Shown . "</font>", $ZhaiYao);
        }
    }
    echo $ZhaiYao;
?>
    </td>
  </tr>
  <tr>
    <td colspan="2" align="right"><a href="One.php?num=<?php echo spider_attr($row[0]); ?>" target=_blank>
      <font color="#0033FF" class="pt12">本地镜像</font></a></td>
  </tr>
  <tr bgcolor="#999933">
    <td nowrap colspan="2" height="1"></td>
  </tr>
<?php
}
?>
<?php if ($RowCount > 20) { ?>
  <tr>
    <td align="right" colspan="2" height="10">
      <form name="form2" method="post" action="Search.php">
        <input type="hidden" name="KeyWords" value="<?php echo spider_attr($KeyWords); ?>">
        <input type="hidden" name="SearchIn" value="<?php echo spider_attr($SearchIn); ?>">
        <input type="hidden" name="Speed" value="<?php echo spider_attr($Speed); ?>">
        <input type="hidden" name="CurPos" value="<?php echo spider_attr($Pos); ?>">
        <input type="submit" name="Submit" value="下20个网页">
      </form>
    </td>
  </tr>
<?php } ?>
</table>
One.php用于从本地镜像中显示一个找到的网页。由于网页的原始信息已经在WebPageLocal中存储,所以只需简单的读出,发给用户。
<?php
// One.php - shows the locally stored copy of one page.
// Request field: num - the Id of the page in table webpagelocal.

// Accept digits only, so the value can be placed in the SQL text safely.
$num = isset($_REQUEST['num']) ? $_REQUEST['num'] : "";
if (!is_string($num) || !preg_match('/^[0-9]+$/', $num)) {
    exit();
}

mysql_connect("localhost", "root", "mypassword") or die(mysql_error());
mysql_select_db("Spider") or die(mysql_error());

// The table name is written exactly as in its CREATE TABLE statement, because
// MySQL table names are case-sensitive on case-sensitive file systems.
$sql = "select Url,Content from webpagelocal where Id=$num";
$result = mysql_query($sql);
if ($result) {
    $mt = mysql_fetch_row($result);
    if ($mt) {
        // The stored page is sent unchanged: it is the mirrored original HTML.
        echo $mt[1];
    }
}
?>
|
|
上一篇:Java语言中两种异常的差别
人气:483
下一篇:漫谈Hibernate的前世今生
人气:481 |
浏览全部Java的内容
Dreamweaver插件下载 网页广告代码 祝你圣诞节快乐 2009年新年快乐
|
|