通过 Python 采集数据时,经常需要从 HTML 中获取图片或文件的 URL 并下载到本地。这里列举最常用的三种下载方法:urllib 模块、urllib2 模块、requests 模块。具体代码如下:
# Python 2 example: download the same file with urllib, urllib2 and requests.
import contextlib
import urllib
import urllib2

import requests

url = 'http://www.test.com/wp-content/uploads/2012/06/wxDbViewer.zip'

# urllib: one call fetches the URL and saves it to the given local path.
# (print with a single parenthesised string prints the same text on Python 2.)
print("downloading with urllib")
urllib.urlretrieve(url, "code.zip")

# urllib2: read the whole response body into memory, then write it to disk.
# closing() releases the connection even if read() raises.
print("downloading with urllib2")
with contextlib.closing(urllib2.urlopen(url)) as f:
    data = f.read()
with open("code2.zip", "wb") as code:
    code.write(data)

# requests: r.content holds the raw response bytes.
print("downloading with requests")
r = requests.get(url)
with open("code3.zip", "wb") as code:
    code.write(r.content)
看起来使用 urllib 最为简单,一条语句即可。当然,urllib2 的写法也可以简写成:
# Shorter urllib2 variant: write the response body straight to disk
# without keeping it in an intermediate variable.
response = urllib2.urlopen(url)
with open("code2.zip", "wb") as out_file:
    out_file.write(response.read())
上面的方法中,urllib2.urlopen 和 requests.get 还可以设置 timeout 参数,避免采集一直阻塞。除上面介绍的方法外,还可以使用 pycurl 模块下载文件:
# Python 2 example: download a file with pycurl and save it to disk.
import pycurl
import StringIO

# `url` is expected to be defined already, as in the urllib example.

##### init the env ###########
c = pycurl.Curl()
# COOKIEFILE: read previously stored cookies from this file (this also
# switches on libcurl's cookie handling).
c.setopt(pycurl.COOKIEFILE, "cookie_file_name")
# COOKIEJAR: write the cookies to this file when the handle is closed.
c.setopt(pycurl.COOKIEJAR, "cookie_file_name")
c.setopt(pycurl.FOLLOWLOCATION, 1)  # follow HTTP redirects
c.setopt(pycurl.MAXREDIRS, 5)  # give up after 5 redirects
# Proxy settings: uncomment and fill in real values if a proxy is needed.
#c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
#c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')

########### get the data && save to file ###########
head = ['Accept:*/*','User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0']
buf = StringIO.StringIO()
try:
    # The handle is named `c`; the response body is collected in `buf`.
    c.setopt(pycurl.WRITEFUNCTION, buf.write)
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.HTTPHEADER, head)
    c.perform()
    the_page = buf.getvalue()
finally:
    # Closing the handle frees the connection and flushes the cookie jar.
    c.close()
    buf.close()

with open("./%s" % ("img_filename",), 'wb') as f:
    f.write(the_page)