閱讀808 返回首頁    go 火車采集器


Html代碼白名單過濾插件(C#)

Html代碼白名單過濾插件(C#)

作者:小文 發布於:2010-10-18 9:28 Monday 分類:免費插件

該插件會過濾掉白名單之外的所有html標簽(白名單內的標簽會保留,但其屬性會被去除).在處理前您需要先使用采集器過濾掉js,css等代碼.您可以直接下載該插件,也可以自己修改代碼以滿足自己的需要.

插件主要源碼:

using System;
using System.IO;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.Collections.Generic;

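/* Notes
 * 1. This namespace must not be changed. You may, of course, use other namespaces in other files and use them from here.
 * 2. You must reference the LeWell.dll file located in the collector's directory.
 * 3. You must implement the Run method of the IPlugin interface.
 * 4. While writing the plugin, validate the parameters passed in and the values passed out.
 * */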

namespace LeWell.Plugins
{
    public class Demo : LeWell.Plugins.IPlugin   //使用的IPlugin接口,請引用 LeWell.Plugins.dll 文件
    {
        /// <summary>
        /// 插件運行處理預留方法
        /// </summary>
        /// <param name="str">要處理的源代碼</param>
        /// <param name="pageurl">采集頁麵地址</param>
        /// <param name="pagetype">頁麵類型為一枚舉 LeWell.Plugins.PageType ,其中List,Pages,Content,Save分別代表列表頁,分頁或多頁,默認頁,保存時</param>
        /// <param name="encoding">頁麵編碼</param>
        /// <param name="cookies">網站的cookies</param>
        /// <returns></returns>
        public string Run(string str, string pageurl, LeWell.Plugins.PageType pagetype, Encoding encoding, System.Net.CookieCollection cookies)
        {
            ////請在這裏執行您的操作,返回string
            //string result = "這個結果是單頁麵測試,後邊的數據為程序接收到的數據:\r\n\r\n";
            //result += "當前的頁麵網址為:" + pageurl.ToString() + "\r\n";
            //result += "當前的頁麵類型為:" + pagetype.ToString() + "\r\n";
            //result += "當前的網頁編碼為:" + encoding.ToString() + "\r\n";
            //result += "當前網頁的代碼為:" + str.ToString() + "\r\n";
            return str;
        }

      
        /// <summary>
        /// 這個是用來在最後處理采集器入庫前的數據的
        /// </summary>
        /// <param name="ht"></param>
        /// <param name="pageurl"></param>
        /// <param name="encoding"></param>
        /// <param name="cookies"></param>
        /// <returns></returns>
        public Hashtable Run(Hashtable ht, string pageurl, Encoding encoding, System.Net.CookieCollection cookies)
        {
            System.Collections.Generic.List<string> list = new System.Collections.Generic.List<string>();
            if (ht.ContainsKey("內容"))
            {
                string data = ht["內容"].ToString();
                if (string.IsNullOrEmpty(data)) return ht;

                list.Add("div");
                list.Add("ul");
                list.Add("dt");
                list.Add("dl");
                list.Add("dd");
                list.Add("table");
                list.Add("tbody");
                list.Add("tr");
                list.Add("td");
                list.Add("p");

                foreach (string s in list)
                {
                    data = System.Text.RegularExpressions.Regex.Replace(data, "<(/?)" + s + "[^>]*?>", "<$1" + s + ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    data = System.Text.RegularExpressions.Regex.Replace(data, "<(/?)" + s + ">", "ASDFGHJKL$1" + s, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                }

                data = System.Text.RegularExpressions.Regex.Replace(data, "<br(\\s)?/?>", "QWERTYUIOPB", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                List<KeyValuePair<string, string>> dic = new List<KeyValuePair<string, string>>();
                System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex("<a[^>]*?>[^<]*?</a>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                System.Text.RegularExpressions.MatchCollection mc = regex.Matches(data);
                foreach (System.Text.RegularExpressions.Match m in mc)
                {
                    dic.Add(new KeyValuePair<string, string>(m.Value, base64_encode(m.Value)));
                }

                regex = new System.Text.RegularExpressions.Regex("<img\\s+[^>]*?>",System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                mc = regex.Matches(data);
                foreach (System.Text.RegularExpressions.Match m in mc)
                {
                    dic.Add(new KeyValuePair<string, string>(m.Value, base64_encode(m.Value)));
                }

                data = System.Text.RegularExpressions.Regex.Replace(data,"<[^>]*?>", "");

                foreach (KeyValuePair<string, string> kv in dic)
                {
                    data = data.Replace(kv.Value, kv.Key);
                }

                foreach (string s in list)
                {
                    data = data.Replace("ASDFGHJKL" + s, "<" + s + ">");
                    data = data.Replace("ASDFGHJKL/" + s, "</" + s + ">");
               }
                data = data.Replace("QWERTYUIOPB","<br>");
                ht["內容"] = data;
            }
            return ht;
        }

        /// <summary>
        ///Base64加密
        /// </summary>
        /// <returns>返回string</returns>
        public  string base64_encode(string s)
        {
            string strResult = "";

            if ((s != null) && (s != ""))
            {
                strResult = Convert.ToBase64String(System.Text.ASCIIEncoding.Default.GetBytes(s));
            }

            return strResult;
        }

        /// <summary>
        ///Base64解密
        /// </summary>
        /// <returns>返回string</returns>
        public  string base64_decode(string s)
        {
            string strResult = "";

            if ((s != null) && (s != ""))
            {
                int buling = s.Length % 4;
                if (buling == 3) s = s + "=";
                else if (buling == 2) s = s + "==";
                else if (buling == 1) s = s.Substring(0, s.Length - 1);
                strResult = System.Text.ASCIIEncoding.Default.GetString(Convert.FromBase64String(s));
            }

            return strResult;
        }
    }

}
 

附件下載:
白名單過濾插件.7z 9.21KB

標簽: 過濾

相關日誌:

火車采集器偽原創插件V9版

單條記錄下載文件名加自增ID

火車采集器二維碼識別插件,已增加V7版本

jin11顏色尺碼獲取插件

百度相關搜索插件V7版(最後更新2012.11.23)

« 新浪微博評論及轉發數采集插件(C#) | PHPWind 8.0 論壇免登陸發布接口發布»

發表評論:

最後更新:2017-05-09 01:05:54

  上一篇:go 齊博cmsv7.0文章發布模塊發布
  下一篇:go 一個腳本網址的采集辦法(11.24)