【NET】网站数据爬取上

最近因为工作需要，写了个采集网站数据的小程序。
主要功能是：首先采集网站列表页的链接并存入数据库（需要排除重复采集的可能）；第一次采集全部数据，之后只采集最新数据；接着读取已采集到的链接地址，保存对应页面的信息。

程序主要爬取的数据有两部分：一个是列表页的链接，一个是内容页的内容。

采集网站不可避免地会用到正则表达式，所以需要在XML中配置每个站点的相关采集信息和正则匹配内容等。
XML模板文件如下：

<?xml version="1.0" encoding="utf-8"?>
<Task>
    <!-- Task name -->
    <TaskName>Mandag's Blog</TaskName>
    <!-- Site the task collects from -->
    <TaskHost>http://mungyulin.github.io/</TaskHost>
    <!-- Collect mode: list (LIST) or page-by-page (PAGE) -->
    <!-- LIST corresponds to PageList -->
    <!-- PAGE corresponds to StartUrl -->
    <!-- NOTE(review): this template's Item has no StartUrl element, although the static-page code reads item.StartUrl; confirm how it is configured -->
    <CollectModel>LIST</CollectModel>
    <!-- Encoding of the list pages -->
    <DefaultEncode>utf-8</DefaultEncode>
    <!-- Request method: POST or GET -->
    <RequestMethod>POST</RequestMethod>
    <!-- Whether a login is required -->
    <HasCookie>false</HasCookie>
    <ListPage>
        <!-- Regex extracting the total record count (Groups["totalRecord"]) and the total page count (Groups["totalPage"]) -->
        <TotalPageRegex></TotalPageRegex>
        <!-- How paging is addressed: by record range (Record) or by page number (Page) -->
        <!-- NOTE(review): the collection code also checks for the value Form (list data fetched by form submission) -->
        <TotalPageMark>Record</TotalPageMark>
        <!-- Regex matching the part of a list page that holds the result data -->
        <ContentResultRegex></ContentResultRegex>
        <!-- Regex extracting each link to collect (Groups["url"]) and its title (Groups["text"]) -->
        <ListPageUrlRegex></ListPageUrlRegex>
    </ListPage>
    <CheckLogin>
        <!-- Login page address -->
        <LoginUrl></LoginUrl>
        <!-- Form data submitted with the login account -->
        <LoginData></LoginData>
        <!-- Marker in the collected content that identifies a successful login -->
        <LoginMark></LoginMark>
    </CheckLogin>
    <PageContent>
        <!-- Encoding of the content pages -->
        <PageEncode>utf-8</PageEncode>
        <!-- Upper limit per collection run; when greater than 0, collection stops automatically once the count is exceeded -->
        <CollectCountMax>0</CollectCountMax>
        <!-- Whether to save the page as a file -->
        <IsSaveAsFile>true</IsSaveAsFile>
        <!-- Whether to save the page to the database -->
        <IsSaveDataBase>true</IsSaveDataBase>
        <!-- Regex for the page title -->
        <PageTitleRegex></PageTitleRegex>
        <!-- Regex for the page content -->
        <PageContentRegex></PageContentRegex>
        <!-- Regex for other page information -->
        <PageInfoRegex></PageInfoRegex>
    </PageContent>
    <Item>
        <Title>某某分類</Title>
        <!-- Whether to skip collecting this item -->
        <NotCollect>false</NotCollect>
        <!-- Reset the collection -->
        <ResetCollect>false</ResetCollect>
        <!-- Collection address -->
        <Location></Location>
        <!-- List page address; the {limit} and {currentPage} placeholders are presumably filled from Parameters (the code passes both to ParaFormat) -->
        <PageList>http://mungyulin.github.io/archive?{limit}&amp;{currentPage}</PageList>
        <!-- Address parameters -->
        <Parameters limit='100' currentPage='1' />
        <!-- Form data -->
        <FormDate></FormDate>
    </Item>
</Task>

配置文件搞定后，就是解析列表页的链接，把每一页的链接取出来。
先读取一个配置文件，获取需要采集的站点信息，
然后获取这个站点列表页的初始页面，从中可以得到总记录数和总页数：

// Fetch the first list page of the site. The total record count and total
// page count are parsed out of this page afterwards with `regex`.
const RegexOptions totalPageOptions =
    RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace;

string page = string.Empty;
// `regex` stays null when no total-page pattern is configured.
Regex regex = string.IsNullOrWhiteSpace(linkItem.ListPage.TotalPageRegex)
    ? null
    : new Regex(linkItem.ListPage.TotalPageRegex, totalPageOptions);

if (linkItem.CollectModel.Equals("List", StringComparison.OrdinalIgnoreCase))
{
    string mark = linkItem.ListPage.TotalPageMark;

    // "Record" (start record in the address) and "Page" (page number in the
    // address) are requested the same way: the address is built from the
    // parameters and the form data is sent untouched.
    if (mark.Equals("Record", StringComparison.OrdinalIgnoreCase)
        || mark.Equals("Page", StringComparison.OrdinalIgnoreCase))
    {
        bool viaPost = linkItem.RequestMethod.Equals("Post", StringComparison.OrdinalIgnoreCase);
        string listUrl = ParaFormat(item.PageList, item.Parameters);
        page = viaPost
            ? RequestManager.RequestData(listUrl, item.FormDate, linkItem.DefaultEncode)
            : RequestManager.GetPage(listUrl, linkItem.DefaultEncode);
    }
    // "Form": the list data comes from a form submission, so the parameters
    // go into the form data and the address is used as-is. Only POST is
    // handled; any other request method leaves `page` empty.
    else if (mark.Equals("Form", StringComparison.OrdinalIgnoreCase)
        && linkItem.RequestMethod.Equals("Post", StringComparison.OrdinalIgnoreCase))
    {
        page = RequestManager.RequestData(item.PageList, ParaFormat(item.FormDate, item.Parameters), linkItem.DefaultEncode);
    }
}

得到了总记录数和总页数后，就可以模拟翻页，获取列表链接了，
目前网络上的网站列表显示有两种（我只知道两种，不知道的我就不管了╮(╯_╰)╭ ）
一种是静态生成，就是一个页面就是一个HTML文件
还有一种是动态的，比如通过AJAX获取部分数据并刷新局部页面

// Static pages: keep requesting pages while they still contain a list of
// links; the first page without one (or an empty response) ends the loop.
string location = item.Location;
int isCount = 1;

// The content filter is the same for every page, so build it once.
Regex pageRegex = null;
if (!string.IsNullOrWhiteSpace(linkItem.ListPage.ContentResultRegex))
{
    pageRegex = new Regex(linkItem.ListPage.ContentResultRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
}

while (isCount != 0)
{
    page = RequestManager.GetPage(location, linkItem.DefaultEncode);
    if (string.IsNullOrWhiteSpace(page))
    {
        // Nothing came back: stop collecting.
        isCount = 0;
        break;
    }

    // Narrow the page down to the configured result area. Without a
    // configured filter the whole page is searched for links (previously the
    // text stayed empty, so no link was ever found and collection stopped
    // after the first page).
    string text = pageRegex != null ? pageRegex.Match(page).Value : page;

    SetLogInfo(string.Format("正在采集【{0}】-【{1}】 {2}", linkItem.TaskName, item.Title, location), LogType.StayTakeSite);
    var list = UrlManager.UrlParse(text, item.Location, linkItem.ListPage.ListPageUrlRegex);
    if (list != null && list.Any())
    {
        foreach (var url in list)
        {
            // Add the link to the database here
        }
        // Build the address of the next page from the page counter.
        location = string.Format(item.StartUrl, isCount);
        isCount++;
    }
    else
    {
        // No links on this page: the list has ended.
        isCount = 0;
    }
}
// Dynamic pages: the total page count is required, so it is parsed from the
// first list page using the configured total-page regex.
if (regex == null)
{
    // No total-page regex configured: the totals cannot be determined, so
    // this item is skipped instead of failing with a NullReferenceException.
    SetLogInfo(string.Format("【{0}】-【{1}】{2}", linkItem.TaskName, item.Title, "未配置总页数正则，无法进行处理！"), LogType.Logger, LogLevelType.Error);
    continue;
}
page = (page ?? string.Empty).Replace("&nbsp;", " ");
Match match = regex.Match(page);
bool hasTotalRecord = match.Groups["totalRecord"].Success;
bool hasTotalPage = match.Groups["totalPage"].Success;
if (hasTotalRecord)
{
    item.Parameters.TotalRecord = e.Convert.ToInt(match.Groups["totalRecord"].Value);
}
if (hasTotalPage)
{
    item.Parameters.TotalPage = e.Convert.ToInt(match.Groups["totalPage"].Value);
}
if (hasTotalRecord && !hasTotalPage)
{
    // Only the record count is known: derive the page count from it.
    if (item.Parameters.Limit != 0)
    {
        // Round up so that a partially filled last page is collected too
        // (plain integer division would drop it).
        item.Parameters.TotalPage = item.Parameters.TotalRecord / item.Parameters.Limit;
        if (item.Parameters.TotalRecord % item.Parameters.Limit != 0)
        {
            item.Parameters.TotalPage++;
        }
    }
    else
    {
        SetLogInfo(string.Format("【{0}】-【{1}】{2}", linkItem.TaskName, item.Title, "缺少每页数量的值，无法进行处理！"), LogType.Logger, LogLevelType.Error);
        continue;
    }
}
// Simulate paging: request every list page from page 1 up to the total page
// count and extract the links to collect from each one.
item.Parameters.CurrentPage = 1;

// The content filter is the same for every page, so build it once.
Regex textRegex = null;
if (!string.IsNullOrWhiteSpace(linkItem.ListPage.ContentResultRegex))
{
    textRegex = new Regex(linkItem.ListPage.ContentResultRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);
}

while (item.Parameters.CurrentPage <= item.Parameters.TotalPage)
{
    string location = string.Empty;
    if (linkItem.CollectModel.Equals("List", StringComparison.OrdinalIgnoreCase))
    {
        if (linkItem.ListPage.TotalPageMark.Equals("Record", StringComparison.OrdinalIgnoreCase))
        {
            // Compute the first and last record of the current page; the last
            // page may be only partially filled.
            item.Parameters.StartNumber = item.Parameters.Limit * (item.Parameters.CurrentPage - 1) + 1;
            item.Parameters.EndNumber = item.Parameters.Limit * item.Parameters.CurrentPage;
            if (item.Parameters.EndNumber > item.Parameters.TotalRecord)
            {
                item.Parameters.EndNumber = item.Parameters.TotalRecord;
            }
            location = ParaFormat(item.PageList, item.Parameters);
        }
        else if (linkItem.ListPage.TotalPageMark.Equals("Page", StringComparison.OrdinalIgnoreCase))
        {
            location = ParaFormat(item.PageList, item.Parameters);
        }
        else
        {
            // Any other mark: the address is used as-is and the parameters
            // travel in the form data instead.
            location = item.PageList;
        }
    }

    if (string.IsNullOrWhiteSpace(location))
    {
        SetLogInfo(string.Format("正在采集【{0}】-【{1}】共{2}页/第{3}页 采集地址未定义！", linkItem.TaskName, item.Title, item.Parameters.TotalPage, item.Parameters.CurrentPage), LogType.Logger, LogLevelType.Warn);
    }
    else
    {
        SetLogInfo(string.Format("正在采集【{0}】-【{1}】共{2}页/第{3}页 {4}", linkItem.TaskName, item.Title, item.Parameters.TotalPage, item.Parameters.CurrentPage, location), LogType.StayTakeSite);
        string text = string.Empty;
        // The configured request method decides between POST and GET.
        if (linkItem.RequestMethod.Equals("Post", StringComparison.OrdinalIgnoreCase))
        {
            if (linkItem.ListPage.TotalPageMark.Equals("Record", StringComparison.OrdinalIgnoreCase))
            {
                // The record range is already in the address; send the form
                // data unchanged.
                text = RequestManager.RequestData(location, item.FormDate, linkItem.DefaultEncode);
            }
            else
            {
                text = RequestManager.RequestData(location, ParaFormat(item.FormDate, item.Parameters), linkItem.DefaultEncode);
            }
        }
        else
        {
            text = RequestManager.GetPage(location, linkItem.DefaultEncode);
        }

        if (string.IsNullOrWhiteSpace(text))
        {
            SetLogInfo(string.Format("【{0}】-【{1}】共{2}页/第{3}页 未读取到数据，无法进行数据采集！", linkItem.TaskName, item.Title, item.Parameters.TotalPage, item.Parameters.CurrentPage), LogType.Logger, LogLevelType.Error);
        }
        else
        {
            // Keep only the parts of the page that hold the <a> tags.
            if (textRegex != null)
            {
                text = string.Concat(textRegex.Matches(text).Cast<Match>().Select(m => m.Value));
            }
            // Extract the list of links waiting to be collected.
            var list = UrlManager.UrlParse(text, item.Location, linkItem.ListPage.ListPageUrlRegex);
            if (list != null && list.Any())
            {
                foreach (var url in list)
                {
                    // Add the link to the database here
                }
            }
            else
            {
                SetLogInfo(string.Format("【{0}】-【{1}】共{2}页/第{3}页 未提取到a标签！", linkItem.TaskName, item.Title, item.Parameters.TotalPage, item.Parameters.CurrentPage), LogType.Logger, LogLevelType.Warn);
            }
        }
    }

    // Always move on to the next page, including after a failed page. The
    // failure branches used to `continue` past the increment, which retried
    // the same page forever.
    if (item.Parameters.CurrentPage == int.MaxValue)
    {
        // The counter cannot advance any further; stop instead of spinning.
        break;
    }
    item.Parameters.CurrentPage++;
}

地址采集的部分就OK了，内容页的采集明天再贴。