数值)被Alexa加入了很多干扰元素,如果只是要将数据(Alexa排名数值)显示在页面倒没有什么问题
,若是要对数据进行处理比如将两个网站的排名数值进行比较或者将网站排名数值存入数据库供日后调
用,则要对抓取到的数据进行适当的处理。
以下是本人结合其他网友提供的代码,对抓取到的数据进行处理后获得干干净净的数值的方法。核心
函数代码如下: <%
'// alexa 世界排名的查询页面为:
'// 以下函数抓取到含有干扰元素的数据并通过函数对数据进行处理,获得干干净净的Alexa排名数值
Function alexa(str)
url="
strs=str
If IsObjInstalled("AspHTTP.Conn")=true Then
str= getaspHTTPPage(url)
else
str= getHTTPPage(url)
End if
if str="" then
Call Error()
else
str_=str
str1=""
set reg=new Regexp
reg.Multiline=True
reg.Global=True
reg.IgnoreCase=true
str_top="<!--Did you know"
str_bottom="</span>"
reg.Pattern=""&str_top&"((.| )*?)"&str_bottom&""
Set matches = reg.execute(str_)
str1=""
For Each match1 in matches
str1=str1&match1.Value&"***"
Next
Set matches = Nothing
Set reg = Nothing
IF str1 <> "" Then
str1 = Replace(str1,"<!--Did you know? Alexa offers this data programmatically.
Visit
Service.-->","")
str1 = Replace(str1,"</span>","")
Str_11=split(str1,"<div class=""borderBottom""></div>")
str1 = Str_11(0)
Str_11 = split(str1,"***")
str1_Pan = Str_11(0)
End If
set reg=new Regexp
reg.Multiline=True
reg.Global=True
reg.IgnoreCase=true
str_top="<td class=""traffic"">"
str_bottom="</td>"
reg.Pattern=""&str_top&"((.| )*?)"&str_bottom&""
Set matches = reg.execute(str_)
str1=""
For Each match1 in matches
str1=str1&match1.Value&"***"
Next
Set matches = Nothing
Set reg = Nothing
IF str1 <> "" Then
Str_11=split(str1,"***")
End If
End if
'************************************
'************************************
alexa=getcorrectvalue(str1_Pan)
'************************************
'************************************
End Function
'************************************
'此功能函数去除干扰元素
'************************************
function getcorrectvalue(source)
source="|"+source+"|"
while InStr(source,"<")>0
thestart = InStr(source, "<")
theend = InStr(source, ">")
source = mid(source,1,thestart-1)+right(source,(len(source)-theend))
wend
source=replace(source,"|","")
source=replace(source,",","")
getcorrectvalue=source
end function
'************************************
'************************************
'// <summary>
'// 采用 Microsoft.XMLHTTP 组件采集数据
'// </summary>
Function getHTTPPage(url)
on error resume next
dim http
set http=Server.createobject("Microsoft.XMLHTTP")
Http.open "GET",url,false
Http.send()
if Http.readystate<>4 then
exit function
end if
getHTTPPage=bytes2BSTR(Http.responseBody)
set http=nothing
if err.number<>0 then err.Clear
End function
'// <summary>
'// 采用 ADODB.Stream 处理采集到的数据,把二进制的文件转成文本字符
'// </summary>
Function Bytes2bStr(vin)
Dim BytesStream,StringReturn
Set BytesStream = Server.CreateObject("ADODB.Stream")
BytesStream.Type = 2
BytesStream.Open
BytesStream.WriteText vin
BytesStream.Position = 0
BytesStream.Charset = "GB2312"
BytesStream.Position = 2
StringReturn =BytesStream.ReadText
BytesStream.close
Set BytesStream = Nothing
Bytes2bStr = StringReturn
End Function
'// <summary>
'// 采用 AspHTTP.Conn 组件采集数据
'// </summary>
Function getaspHTTPPage(url)
if url="" then
exit function
end if
Set HttpObj = Server.CreateObject("AspHTTP.Conn")
'设置代理服务器,通过代理上网的用户需要设置此选项
If ProxyIP=1 Then
HttpObj.Proxy="192.168.5.254:808"
end if
HTTPObj.TimeOut = 45
HttpObj.Url = url
HttpObj.RequestMethod = "GET"
getaspHTTPPage = HttpObj.GetURL
set HttpObj=nothing
End function
'//<summary>
'//检查组件,采用xmlhttp抓取网页还是AspHTTP
'//</summary>
Function IsObjInstalled(strClassString)
On Error Resume Next
IsObjInstalled = False
Err = 0
Dim xTestObj
Set xTestObj = Server.CreateObject(strClassString)
If 0 = Err Then
If AspHttpOpen=1 Then
IsObjInstalled = True
'Response.write "当前组件 ASPHTTP"
Else
IsObjInstalled = False
'Response.write "当前组件 XMLHTTP"
End If
Else
IsObjInstalled = False
'Response.write "当前组件 XMLHTTP"
End If
Set xTestObj = Nothing
Err = 0
End Function
Sub Error()
response.write "<BR> 抓取不到数据-可能是因为网络原因不能访问站点<BR><a
href=javascript:location.reload();>重试</a>"
response.end
End Sub
%>
调用方法:
<%
response.write alexa("
%>
<%=alexa("