808
火車采集器
Html代碼白名單過濾插件(C#)
Html代碼白名單過濾插件(C#)
作者:小文 發布於:2010-10-18 9:28 Monday 分類:免費插件
該插件會過濾掉白名單以外的所有HTML代碼,隻保留需要的標簽。在處理前,您需要先使用采集器過濾JS、CSS等代碼。您可以直接下載該插件,也可以自行修改代碼以滿足自己的需要。
插件主要源碼:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
/*注意事項
* 1.這個命名空間不能更改.當然,你可以在其它的文件裏使用其它的命名空間,在這裏使用.
* 2.必須引用采集器目錄下的LeWell.dll文件.
* 3.必須實現 IPlugin接口 裏的 Run 方法
* 4.編寫過程中請對傳入傳出的參數進行檢測
* */
namespace LeWell.Plugins
{
/// <summary>
/// HTML whitelist filter plugin. The string overload passes page source through
/// untouched; the Hashtable overload rewrites the content field so that only a
/// fixed set of structural tags (without attributes), line breaks, text-only
/// anchors and images survive. Every other tag is removed while its inner text
/// is kept.
/// </summary>
public class Demo : LeWell.Plugins.IPlugin // IPlugin is declared in the LeWell plugin assembly; reference it when compiling
{
    // Key of the Hashtable entry that holds the HTML to filter.
    // NOTE(review): must match the field label configured in the collector - confirm.
    private const string ContentKey = "內容";

    // Sentinels that stand in for whitelisted markup while all remaining tags are stripped.
    // They contain no angle brackets, so the strip pass cannot touch them.
    private const string TagMarker = "ASDFGHJKL";
    private const string BreakMarker = "QWERTYUIOPB";
    private const string PreservedMarkerStart = "ZXCVBNM";
    private const string PreservedMarkerEnd = "Q";

    // Structural tags that are kept (attributes removed). No name in this list is a
    // prefix of another, which keeps the marker restore step unambiguous.
    private static readonly string[] AllowedTags =
    {
        "div", "ul", "dt", "dl", "dd", "table", "tbody", "tr", "td", "p"
    };

    // <br>, <br/>, <br /> and <br ...attributes...>.
    private static readonly Regex LineBreakPattern =
        new Regex("<br(?=[\\s/>])[^>]*>", RegexOptions.IgnoreCase);

    // Anchors whose content is plain text, and <img ...> tags. The lookahead after
    // "a" stops the pattern from matching <abbr>, <article> and similar names.
    private static readonly Regex PreservedElementPattern =
        new Regex("<a(?=[\\s>])[^>]*>[^<]*</a>|<img\\s+[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex AnyTagPattern = new Regex("<[^>]*>");

    private static readonly Regex PreservedTokenPattern =
        new Regex(PreservedMarkerStart + "([0-9]+)" + PreservedMarkerEnd);

    /// <summary>
    /// Per-page hook. This plugin does no per-page processing and returns the
    /// source unchanged.
    /// </summary>
    /// <param name="str">Source code of the page.</param>
    /// <param name="pageurl">Address of the page being collected.</param>
    /// <param name="pagetype">Kind of page (a LeWell.Plugins.PageType value).</param>
    /// <param name="encoding">Encoding of the page.</param>
    /// <param name="cookies">Cookies of the site.</param>
    /// <returns><paramref name="str"/>, unmodified.</returns>
    public string Run(string str, string pageurl, LeWell.Plugins.PageType pagetype, Encoding encoding, System.Net.CookieCollection cookies)
    {
        return str;
    }

    /// <summary>
    /// Filters the HTML stored under the content key of <paramref name="ht"/>
    /// and writes the filtered markup back under the same key.
    /// </summary>
    /// <param name="ht">Collected fields; may be null.</param>
    /// <param name="pageurl">Address of the page (unused).</param>
    /// <param name="encoding">Encoding of the page (unused).</param>
    /// <param name="cookies">Cookies of the site (unused).</param>
    /// <returns>
    /// The same <paramref name="ht"/> instance. It is returned untouched when it
    /// is null, has no content entry, or the entry is null or empty.
    /// </returns>
    public Hashtable Run(Hashtable ht, string pageurl, Encoding encoding, System.Net.CookieCollection cookies)
    {
        if (ht == null || !ht.ContainsKey(ContentKey))
        {
            return ht;
        }

        // The entry may exist with a null value; calling ToString() on it directly would throw.
        object raw = ht[ContentKey];
        string data = (raw == null) ? null : raw.ToString();
        if (string.IsNullOrEmpty(data))
        {
            return ht;
        }

        ht[ContentKey] = FilterHtml(data);
        return ht;
    }

    /// <summary>
    /// Encodes <paramref name="s"/> as Base64 using the bytes produced by
    /// <see cref="Encoding.Default"/>.
    /// </summary>
    /// <param name="s">Text to encode.</param>
    /// <returns>The Base64 text, or an empty string when <paramref name="s"/> is null or empty.</returns>
    public string base64_encode(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return "";
        }

        // The original spelled this ASCIIEncoding.Default, which is the inherited
        // Encoding.Default member (not ASCII); written out here so the reader is not misled.
        return Convert.ToBase64String(Encoding.Default.GetBytes(s));
    }

    /// <summary>
    /// Decodes Base64 text produced by <see cref="base64_encode"/>. Input whose
    /// length is not a multiple of four is repaired first: missing '=' padding is
    /// appended, and a single dangling character is dropped.
    /// </summary>
    /// <param name="s">Base64 text.</param>
    /// <returns>The decoded text, or an empty string when <paramref name="s"/> is null or empty.</returns>
    /// <exception cref="FormatException">Thrown by Convert.FromBase64String when the text is not valid Base64.</exception>
    public string base64_decode(string s)
    {
        if (string.IsNullOrEmpty(s))
        {
            return "";
        }

        switch (s.Length % 4)
        {
            case 3:
                s = s + "=";
                break;
            case 2:
                s = s + "==";
                break;
            case 1:
                // One leftover character can never be valid Base64; discard it.
                s = s.Substring(0, s.Length - 1);
                break;
        }

        return Encoding.Default.GetString(Convert.FromBase64String(s));
    }

    // Applies the whitelist to one HTML fragment and returns the filtered markup.
    private static string FilterHtml(string html)
    {
        string data = html;

        // 1. Turn each whitelisted tag (opening or closing, any attributes) into a marker.
        //    The lookahead requires the tag name to end here, so "p" no longer swallows
        //    <pre> or <param>, and "tr" no longer swallows <track>.
        foreach (string tag in AllowedTags)
        {
            data = Regex.Replace(
                data,
                "<(/?)" + tag + "(?=[\\s/>])[^>]*>",
                TagMarker + "${1}" + tag,
                RegexOptions.IgnoreCase);
        }

        // 2. Line breaks become a marker as well.
        data = LineBreakPattern.Replace(data, BreakMarker);

        // 3. Swap every anchor/image for an indexed token and remember the original text.
        //    Without this substitution the strip pass below would delete them.
        List<string> preserved = new List<string>();
        data = PreservedElementPattern.Replace(data, delegate(Match m)
        {
            preserved.Add(m.Value);
            return PreservedMarkerStart
                + (preserved.Count - 1).ToString(CultureInfo.InvariantCulture)
                + PreservedMarkerEnd;
        });

        // 4. Everything that still looks like a tag is not whitelisted: remove it.
        data = AnyTagPattern.Replace(data, "");

        // 5. Put anchors/images back in a single pass, so restored text is never rescanned.
        data = PreservedTokenPattern.Replace(data, delegate(Match m)
        {
            int index;
            if (int.TryParse(m.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out index)
                && index < preserved.Count)
            {
                return preserved[index];
            }

            // Token-like text that did not come from step 3 is left as it was.
            return m.Value;
        });

        // 6. Turn the tag and line-break markers back into bare tags.
        foreach (string tag in AllowedTags)
        {
            data = data.Replace(TagMarker + tag, "<" + tag + ">");
            data = data.Replace(TagMarker + "/" + tag, "</" + tag + ">");
        }

        return data.Replace(BreakMarker, "<br>");
    }
}
}
附件下載:
白名單過濾插件.7z 9.21KB
標簽: 過濾
最後更新:2017-05-09 01:05:54