Three Examples of Multi-threaded Website Image Scrapers

October 3, 2014

Scraper Code One

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os ,sys ,urllib2,socket
import re
import time
from threading import Thread
from Queue import Queue
DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download') #save directory
socket.setdefaulttimeout(30)
THREAD_COUNT = 5 #number of worker threads
def md5sum(s):
    #hashlib exists from Python 2.5 on; fall back to the old md5 module
    try:
        import hashlib
        m = hashlib.md5()
        m.update(s)
        return m.hexdigest()
    except ImportError:
        import md5
        m = md5.new()
        m.update(s)
        return m.hexdigest()
class spiderList(Thread):
    def __init__(self ,queue):
        Thread.__init__(self)
        self.queue = queue
    def run(self):
        #the list pages on this site run from page 1 to page 117;
        #queue each page URL as its own job so the detail threads
        #share the work instead of each repeating all of it
        for i in range(1, 118):
            self.queue.put((i, 'http://xxx.com/?page=%s' % i))
class spiderDetail(Thread):
    def __init__(self,queue):
        Thread.__init__(self)
        self.queue = queue
        self.header = {
            'User-Agent':'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'
        }
    def run(self):
        while True:
            self.page, url = self.queue.get()
            rq = urllib2.urlopen(urllib2.Request(url = url, headers = self.header))
            #the original [wW] matched only the letters w/W; [\w\W] matches anything
            result = re.findall(r'_src="([\w\W]+?)"', rq.read())
            for src in result:   #findall returns a list, so just iterate it
                bigImage = self.__getBigImage(src)
                if bigImage != '':
                    img = urllib2.urlopen(bigImage).read()
                    fileName = self.__getFileName(bigImage)
                    file(fileName, 'wb').write(img)
            self.queue.task_done()
    def __getDir(self):
        import datetime
        now = datetime.datetime.now()
        dateDir = now.strftime('%Y-%m-%d')
        saveDir = os.path.join(DOWNLOAD_BASEDIR, dateDir)
        pageDir = 'page_%d' % self.page
        saveDir = os.path.join(saveDir, pageDir)
        if not os.path.isdir(saveDir):
            os.makedirs(saveDir)
        return saveDir
    def __getBigImage(self, url):
        if url == '':
            return ''
        #fix the character class (A-z also matched punctuation) and escape the dot
        args = re.split(r'-([0-9a-zA-Z]+)\.', url)
        return args[0] + '.' + args[2]
    def __getFileName(self, url):
        baseName = os.path.basename(url)   #was bigImage, which is undefined here
        args = os.path.splitext(baseName)
        fileName = md5sum(args[0])+args[1]
        return os.path.join(self.__getDir(), fileName)
if __name__ == '__main__':
    queue = Queue()
    #one producer is enough; starting THREAD_COUNT of them made every
    #detail thread re-download the same pages
    lt = spiderList(queue)
    lt.setDaemon(True)
    lt.start()
    for i in range(THREAD_COUNT):
        dt = spiderDetail(queue)
        dt.setDaemon(True)
        dt.start()
    lt.join()
    queue.join()   #blocks until every queued page is processed, without busy-waiting
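
The listing above leans on the producer/consumer pattern from the standard Queue module. Here is a minimal sketch of just that pattern, stripped of the scraping details; the worker function and the job strings are illustrative stand-ins, not part of the original scraper.

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# minimal producer/consumer sketch; "worker" and the job names are
# illustrative, not from the original code
from threading import Thread
from Queue import Queue

queue = Queue()

def worker():
    while True:
        item = queue.get()        # blocks until a job is available
        print "processing %s" % item
        queue.task_done()         # tell the queue this job is finished

for i in range(5):
    t = Thread(target=worker)
    t.setDaemon(True)             # daemon threads die with the main thread
    t.start()

for i in range(10):
    queue.put("job-%d" % i)

queue.join()                      # returns once every job is task_done()

queue.join() is also what replaces the original while 1: pass loop: it blocks the main thread until the workers finish, without burning a CPU core.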

Scraper Code Two

#!/usr/bin/env python
#-*- coding: utf-8 -*-
#download web content via the urllib / urllib2 modules
import urllib,urllib2,gevent
#regular expression and time modules
import re,time
from gevent import monkey
monkey.patch_all()
def geturllist(url):
    url_list=[]
    print url
    s = urllib2.urlopen(url)
    text = s.read()
    #regex match: narrow to the <ol> comment list, then pull out the images
    html = re.search(r'<ol.*</ol>', text, re.S)
    if html is None:   #page layout changed or fetch failed; skip it
        return url_list
    urls = re.finditer(r'<p><img src="(.+?\.jpg)" /></p>', html.group(), re.I)
    for i in urls:
        url_list.append(i.group(1).strip())
    return url_list
def download(down_url):
    name = str(time.time())[:-3] + "_" + re.sub('.+?/', '', down_url)
    print name
    #the trailing backslash in "D:\TEMP\" escaped the closing quote and
    #broke the string literal; double the backslashes so it parses
    urllib.urlretrieve(down_url, "D:\\TEMP\\" + name)
def getpageurl():
    page_list = []
    #loop over the list pages
    for page in range(1,700):
        url="http://jandan.net/ooxx/page-"+str(page)+"#comments"
        #append the generated url to page_list
        page_list.append(url)
    print page_list
    return page_list
if __name__ == '__main__':
    jobs = []
    pageurl = getpageurl()[::-1]
    #download the images
    for i in pageurl:
        for downurl in geturllist(i):
            jobs.append(gevent.spawn(download, downurl))
    gevent.joinall(jobs)
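
The key line in this listing is monkey.patch_all(), which swaps the blocking socket machinery for gevent's cooperative version, so the urllib2 calls inside each spawned greenlet yield to one another instead of running one after another. A minimal sketch of that effect, with a placeholder URL standing in for real targets:

#!/usr/bin/env python
#-*- coding: utf-8 -*-
# minimal gevent sketch; the URL is a placeholder, not from the original
import gevent
from gevent import monkey
monkey.patch_all()   # patch sockets before any blocking call is made
import urllib2, time

def fetch(n, url):
    # with the patch in place, this read yields to other greenlets
    body = urllib2.urlopen(url).read()
    print n, len(body)

start = time.time()
# all three downloads overlap inside a single OS thread
jobs = [gevent.spawn(fetch, n, 'http://example.com/') for n in range(3)]
gevent.joinall(jobs)
print "took %.2fs" % (time.time() - start)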

Scraper Code Three

import os,time,sys,re,threading
import urllib
DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')
DOWNLOAD_BASEURL = './download/'
if not os.path.isdir(DOWNLOAD_BASEDIR):   #plain mkdir raised OSError on reruns
    os.makedirs(DOWNLOAD_BASEDIR)
def md5sum(s):
    #hashlib exists from Python 2.5 on; fall back to the old md5 module
    try:
        import hashlib
        m = hashlib.md5()
        m.update(s)
        return m.hexdigest()
    except ImportError:
        import md5
        m = md5.new()
        m.update(s)
        return m.hexdigest()
class Download(threading.Thread):
    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url
    def run(self):
##        print "downloading %s " % self.url
        f = urllib.urlopen(self.url)
        content_type, extension = f.headers.get('content-type', 'image/jpeg').split('/')
        if extension in ('jpeg', 'html'):
            extension = 'jpg'
        basename = "%s.%s" % (md5sum(self.url), extension)
        self.filename = os.path.join(DOWNLOAD_BASEDIR, basename)
        self.local_url = DOWNLOAD_BASEURL + basename
        file(self.filename, 'wb').write(f.read())
content = file(os.path.join(os.path.dirname(__file__), 'content.html')).read()
pt=re.compile(r"""src=['"]?(http://.*?)[ '"]""")
urls = pt.findall(content)
print time.ctime()
thread_pools = []
for url in urls:
    current = Download(url)
    thread_pools.append(current)
    current.start()
result_text = content
for result in thread_pools:
    print "%s threads running" % threading.activeCount()
    result.join(5)
    if not result.isAlive():
##        print "url %s saved to %s" % (result.url, result.filename)
        result_text = result_text.replace(result.url, result.local_url)
file(os.path.join(os.path.dirname(__file__), 'result.html'), 'wb').write(result_text)
print "%s threads running" % threading.activeCount()
if threading.activeCount() > 1:   #the main thread alone counts as one
    print "Can not stop"
print time.ctime()
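
Two details in this listing are worth isolating: join(timeout) bounds how long the main thread waits for each download thread, and isAlive() then distinguishes finished threads from stragglers, so the HTML rewrite only substitutes URLs whose local copy actually exists. A minimal sketch of that bounded-wait pattern; slow_job is an illustrative stand-in:

# bounded-wait sketch; slow_job is a stand-in, not from the original
import threading, time

def slow_job():
    time.sleep(10)

t = threading.Thread(target=slow_job)
t.start()
t.join(5)                  # wait at most 5 seconds for this thread
if t.isAlive():            # join() timed out; the thread is still busy
    print "still running, skipping its result"
else:
    print "finished in time"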



