C# : 網路爬蟲 (Crawler) 的設計

基礎篇

C# 簡介

開發環境

變數與運算

流程控制

陣列

函數

物件

例外處理

函式庫篇

檔案處理

資料結構

正規表達式

Thread

應用篇

視窗程式

媒體影音

網路程式

遊戲程式

手機程式

資料庫

雲端運算

特殊功能

委派

擴展方法

序列化

LinQ

WPF

網路資源

教學影片

投影片

教學文章

軟體下載

考題解答

101習題

簡介

原始程式

using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

class WebCrawler
{
//    WebProxy proxy = new WebProxy("http://proxy.internal:3128/", true);
    List<String> urlList = new List<String>();
    // Dictionary<String, String> 

    public static void Main(String[] args)
    {
        WebCrawler crawler = new WebCrawler();
        crawler.urlList.Add("http://tw.msn.com/");
        crawler.craw();
    }

    public void craw()
    {
        int urlIdx = 0;
        while (urlIdx < urlList.Count)
        {
            try
            {
                String url = urlList[urlIdx];
                String fileName = "data/" + toFileName(url);
                Console.WriteLine(urlIdx + ":url=" + url + " file=" + fileName);
                urlToFile(url, fileName);
                String html = fileToText(fileName);
                foreach (String childUrl in matches("\\shref\\s*=\\s*\"(.*?)\"", html, 1))
                {
                    Console.WriteLine(childUrl);
                    urlList.Add(childUrl);
                }
            }
            catch
            {
                Console.WriteLine("Error:" + urlList[urlIdx] + " fail!");
            }
            urlIdx++;
        }
    }

    public static IEnumerable matches(String pPattern, String pText, int pGroupId)
    {
        Regex r = new Regex(pPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
        for (Match m = r.Match(pText); m.Success; m = m.NextMatch())
         yield return m.Groups[pGroupId].Value;
    }

    public static String fileToText(String filePath)
    {
        StreamReader file = new StreamReader(filePath);
        String text = file.ReadToEnd();
        file.Close();
        return text;
    }

    public void urlToFile(String url, String file)
    {
        WebClient webclient = new WebClient();
//        webclient.Proxy = proxy;
        webclient.DownloadFile(url, file);
    }

    public static String toFileName(String url)
    {
        String fileName = url.Replace('?', '_');
        fileName = fileName.Replace('/', '_');
        fileName = fileName.Replace('&', '_');
        fileName = fileName.Replace(':', '_');
        fileName = fileName.ToLower();
        if (!fileName.EndsWith(".htm") && !fileName.EndsWith(".html"))
         fileName = fileName + ".htm";
        return fileName;
    }
}
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License