【Python】Simple Data Scraping with Python

Recently I've been watching Mr. Robot, and I briefly skimmed through the Knownsec R&D skill table. It's rather overwhelming, but I did pick up some Python along the way, and as practice I wrote a small program that scrapes images from a website.

The site being scraped is: http://www.22mm.cc/


The code below targets Python 2.7 and is split into four modules: a main scheduler (SpiderMain), a URL manager (UrlManager), an HTML parser (HtmlParser), and an HTML outputer (HtmlOutputer).

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Main entry point: wires the URL manager, parser, and outputer together
import urllib2
import url_manager, pic_outputer, pic_parser


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.parser = pic_parser.HtmlParser()
        self.outputer = pic_outputer.HtmlOutputer()

    def getPage(self, url):
        # Fetch a page and return its HTML, or None on failure
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()

    def craw(self, root_url):
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.getPage(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
            except Exception:
                print 'craw failed'
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://www.22mm.cc/"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
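
One caveat worth knowing: many image hosts reject requests carrying urllib2's default user agent. If getPage keeps failing, sending an explicit User-Agent header is a reasonable first fix. A minimal sketch, not part of the original program, with a placeholder header value:

# Hypothetical variant of getPage that sends a browser-like User-Agent
import urllib2

def get_page_with_ua(url):
    request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(request, timeout=10)  # timeout avoids hanging forever
    if response.getcode() != 200:
        return None
    return response.read()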
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
# URL manager: keeps pending URLs separate from URLs already crawled


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already handed out

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Move one URL from the pending set to the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
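
Because both sets are consulted before a URL is admitted, each page is fetched at most once no matter how many links point to it. A quick check of that behavior, assuming the UrlManager class above:

# Hypothetical interactive check of the dedup behavior
manager = UrlManager()
manager.add_new_url("http://www.22mm.cc/")
manager.add_new_url("http://www.22mm.cc/")  # duplicate, silently ignored
url = manager.get_new_url()                 # moves the URL into old_urls
manager.add_new_url(url)                    # already crawled, ignored too
print manager.has_new_url()                 # prints False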
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
# HTML parser: extracts follow-up gallery links and image URLs from a page
import re
import urlparse
from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            # Return empty results so the caller can still unpack the tuple
            return set(), None
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # Collect links to gallery pages of the form /mm/.../....html
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/mm/(.*?)/(.*?)\.html"))
        for link in links:
            new_url = link['href']
            new_full_url = new_url
            if "http://" not in new_url:
                # Resolve relative links against the page they appeared on
                new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # Strip scheme and slashes so the URL can double as a folder name
        new_str_url = page_url.replace('/', '').replace('http:', '').replace('https:', '')
        res_data['url'] = new_str_url
        res_data['img'] = {}
        # Map each .jpg URL to its bare file name
        image_list = soup.find_all('img', src=re.compile(r"(.*?)\.jpg"))
        for item in image_list:
            res_data['img'][item['src']] = item['src'].split('/')[-1]
        return res_data
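
The heavy lifting here is urlparse.urljoin, which turns a relative href such as /mm/mv/123.html into an absolute URL based on the page it was found on. The parser can also be exercised offline by feeding it a hand-written snippet; the HTML and the image URL below are made up for illustration:

# Hypothetical offline check of HtmlParser
html = ('<a href="/mm/mv/123.html">gallery</a>'
        '<img src="http://pic.example.com/b/001.jpg">')
parser = HtmlParser()
new_urls, new_data = parser.parse("http://www.22mm.cc/", html)
print new_urls         # set(['http://www.22mm.cc/mm/mv/123.html'])
print new_data['img']  # {'http://pic.example.com/b/001.jpg': '001.jpg'}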
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
# HTML outputer: downloads the collected images and writes an index page
import sys
import os
import urllib


class HtmlOutputer(object):
    def __init__(self):
        # Python 2 hack: make UTF-8 the default codec for implicit conversions
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.datas = []

    def saveImg(self, imageURL, fileName):
        # Download one image and write it to disk
        imageData = urllib.urlopen(imageURL).read()
        f = open(fileName, 'wb')
        f.write(imageData)
        print u"Saving image:", fileName
        f.close()

    def mkdir(self, path):
        path = path.strip()
        if not os.path.exists(path):
            print u"Created folder", path
            os.makedirs(path)
            return True
        else:
            print u"Folder", path, u"already exists"
            return False

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table border='1' cellspacing='0' bordercolor='#666666' style='border-collapse: collapse;'>")
        fout.write("<tr><th>Url</th><th>Images</th><th>Name</th></tr>")
        for data in self.datas:
            # One folder per page; its images are saved inside it
            self.mkdir(data['url'])
            for key, value in data['img'].items():
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data['url'])
                fout.write("<td>")
                fout.write("<img src='%s'>" % key.encode('utf-8'))
                fout.write("</td>")
                fout.write("<td>%s</td>" % value.encode('utf-8'))
                fout.write("</tr>")
                self.saveImg(key, data['url'] + u"/" + value)
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
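
saveImg has two soft spots: the file handle stays open if the write throws, and a single bad image URL bubbles all the way up and aborts the page being processed. A slightly more defensive variant, same behavior on success, might look like this (save_img is a hypothetical replacement, not part of the original code):

# Hypothetical defensive replacement for saveImg
import urllib

def save_img(image_url, file_name):
    try:
        image_data = urllib.urlopen(image_url).read()
    except IOError as e:
        # Skip the broken image instead of aborting the whole run
        print u"Failed to download", image_url, e
        return
    with open(file_name, 'wb') as f:  # the handle closes even if write fails
        f.write(image_data)
    print u"Saving image:", file_name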

References:
Python 2.7教程 (Python 2.7 tutorial)
Python 爬虫学习系列教程 (Python crawler tutorial series)
Python开发简单爬虫 (Developing a simple Python crawler)