Node.js 小爬虫

编写爬虫示例:html

var http = require('http');
var cheerio = require('cheerio');

// Address of the blog page that the http.get call below downloads.
var url = 'http://www.cnblogs.com/tianxintian22/';

/**
 * Pulls the post id out of a post link such as
 * `http://www.cnblogs.com/<user>/p/<id>.html`.
 *
 * Matching on the full `/p/` path segment (instead of splitting on `p/`)
 * keeps user names that end in "p" from being mistaken for the separator.
 *
 * @param {string|undefined} href - Value of the link's href attribute.
 * @returns {string} The segment after `/p/` without a trailing `.html`,
 *     or an empty string when the link is missing or has no `/p/` segment.
 */
function extractPostId(href) {
    var match = /\/p\/([^\/?#]+)/.exec(href || '');
    return match ? match[1].replace(/\.html$/, '') : '';
}

/**
 * Extracts the posts listed on a blog index page, grouped by day.
 *
 * Result shape:
 * [{
 *     dayTitle: '',
 *     dayCont: [{
 *         postId: '',
 *         postTitle: '',
 *         postCont: ''
 *     }]
 * }]
 *
 * @param {string} html - Markup of the index page; handed to cheerio.load.
 * @returns {Array} One entry per `.day` element, each holding one `dayCont`
 *     item per `.postTitle` element found inside that day.
 */
function filterblogs(html) {
    var $ = cheerio.load(html);
    var blogDatas = [];

    $('.day').each(function(dayIndex, dayElement) {
        var day = $(dayElement);
        var titles = day.find('.postTitle');
        var summaries = day.find('.postCon');
        var dayCont = [];

        // A day may hold several posts; the n-th `.postTitle` is paired with
        // the n-th `.postCon` so titles and summaries are not merged together.
        titles.each(function(postIndex, titleElement) {
            var summary = summaries.eq(postIndex).find('.c_b_p_desc');

            dayCont.push({
                // The summary's link points at the post; a summary without
                // a link yields an empty id instead of throwing.
                postId: extractPostId(summary.find('a').attr('href')),
                postTitle: $(titleElement).find('a').text(),
                postCont: summary.text()
            });
        });

        blogDatas.push({
            dayTitle: day.find('.dayTitle a').text(),
            dayCont: dayCont
        });
    });

    return blogDatas;
}

/**
 * Writes the crawled posts to the console, grouped by day.
 *
 * For each day the day title is logged first; every post of that day then
 * produces two indented log calls: "【id】title" and the post summary.
 *
 * @param {Array} blogDatas - Entries of the form
 *     { dayTitle, dayCont: [{ postId, postTitle, postCont }] }.
 */
function printBlogInfo(blogDatas) {
    // Logs one post: id and title first, then its summary text.
    function printPost(post) {
        console.log('    【' + post.postId + '】' + post.postTitle +'\n');
        console.log('    ' + post.postCont + '\n');
    }

    // Logs a day's heading followed by all of its posts.
    function printDay(day) {
        console.log(day.dayTitle + '\n');
        day.dayCont.forEach(function(post) {
            printPost(post);
        });
    }

    blogDatas.forEach(function(day) {
        printDay(day);
    });
}

// Download the page at `url`; once the whole body has arrived, parse it and
// print the extracted posts.
http.get(url, function (res) {
    // Anything other than 200 (redirect, error page, ...) is not the blog
    // index, so report it instead of trying to parse it.
    if (res.statusCode !== 200) {
        console.error('获取博客数据出错,状态码:' + res.statusCode);
        // Discard the body so the response is fully consumed.
        res.resume();
        return;
    }

    // Decode inside the stream: concatenating raw Buffer chunks would corrupt
    // any multi-byte UTF-8 character that is split across two chunks.
    res.setEncoding('utf8');

    var html = '';

    res.on('data', function(chunk) {
        html += chunk;
    });

    res.on('end', function() {
        var blogDatas = filterblogs(html);
        printBlogInfo(blogDatas);
    });
}).on('error', function(err) {
    // Include the underlying reason so failures can be diagnosed.
    console.error('获取博客数据出错: ' + err.message);
});