
Scraping images from a web page with Python and saving them locally

Posted: 2023-04-05 10:12:43


In a previous article we shared a PHP implementation for batch-downloading remote images and saving them locally; readers who are interested can refer to that post for details. Below are two Python scripts that do the same job.


# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

# get the file's extension
def get_file_extension(file):
    return os.path.splitext(file)[1]

# create the directory if it does not exist, and return it
def mkdir(path):
    # strip surrounding whitespace and a trailing backslash
    path = path.strip()
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

# generate a unique string, always 36 characters long
def unique_str():
    return str(uuid.uuid1())

# fetch the content at url into memory
# @url  the file to fetch (path + filename)
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

# save data to a local file
# @path       local directory
# @file_name  file name
# @data       file content
def save_file(path, file_name, data):
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get the file's extension
print get_file_extension("123.jpg")
# create the directory and return it
# print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()
url = "http://qlogo1./qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
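The script above targets Python 2: urllib2, cookielib and the print statement do not exist in Python 3. For readers on Python 3, the following is a minimal sketch of the same fetch-and-save flow using only the standard library's urllib.request and http.cookiejar; the example URL and save directory are placeholders and not part of the original script.

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the same flow: fetch a URL through an opener
# with a cookie jar, then write the response bytes to a local file.
# The URL and directory below are placeholders.
import os
import urllib.request
import http.cookiejar

def get_file(url):
    """Fetch the raw bytes at url, or return None on failure."""
    try:
        cj = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        with opener.open(url) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Create path if needed and write data into path/file_name."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

if __name__ == "__main__":
    data = get_file("http://example.com/some_image.jpg")  # placeholder URL
    save_file("./images", "123.jpg", data)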


Downloading the images from a given URL to the local disk with Python


# *** encoding: utf-8 ***
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use the default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:    # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:                     # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get the html source, return lines[]"""
def gGetHtmlLines(url):
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> %s" % e
        return

"""get the html source, return a string"""
def gGetHtml(url):
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> %s" % e
        return

"""get the file name from a url"""
def gGetFileName(url):
    if url is None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""build the absolute address of a link found on the page at url"""
def gGetAbslLink(url, link):
    if url is None or link is None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match a regular expression against the given lines, return a list"""
def gGetRegList(linesList, regx):
    if linesList is None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download the file at url, with the file name given explicitly"""
def gDownloadWithFilename(url, savePath, file):
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>> %s" % e

"""download the file at url, taking the file name from the url"""
def gDownload(url, savePath):
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    # fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download the jpg files referenced by the page at downloadUrl"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)    # get the links which match the regular expression
    if lists is None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the site root address from a url"""
def gGetHttpAddr(url):
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""get the parent directory of a url"""
def gGetHttpAddrFather(url):
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""given a url and a relative link on it, build the link's absolute address"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of levels to go up
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""collect the htm/html links on the page at url, return a list"""
def gGetHtmlLink(url):
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""download the jpg files on the page at url and on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # download the jpg files on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = "/196738/room/2462453/"  # the page whose images we want to fetch
    save = "/root/python/tmp/"   # directory where the images are saved
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
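This second script also targets Python 2; urllib, httplib and urlparse were merged into urllib.request, http.client and urllib.parse in Python 3. For readers on Python 3, the sketch below covers its core idea only: read the page source, extract the src="....jpg" references with a regular expression, resolve them against the page URL, and download each one. The page URL and save directory are placeholders, not values from the original.

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch: fetch the page, regex out the .jpg sources,
# resolve them to absolute URLs and download each one into save_path.
# The page URL and directory below are placeholders.
import os
import re
import urllib.parse
import urllib.request

def download_html_jpg(page_url, save_path):
    """Download every .jpg referenced via src="..." on the page at page_url."""
    html = urllib.request.urlopen(page_url).read().decode("utf-8", errors="ignore")
    os.makedirs(save_path, exist_ok=True)
    for src in re.findall(r'src\s*=\s*"?(\S+?\.jpg)', html, re.IGNORECASE):
        jpg_url = urllib.parse.urljoin(page_url, src)  # handles /, ../ and absolute links
        file_name = jpg_url.rsplit("/", 1)[-1]
        try:
            urllib.request.urlretrieve(jpg_url, os.path.join(save_path, file_name))
            print(file_name)
        except OSError as e:
            print("DOWNLOAD %s ERROR! ==>> %s" % (jpg_url, e))

if __name__ == "__main__":
    download_html_jpg("http://example.com/gallery/", "./tmp/")  # placeholder values

In this sketch urllib.parse.urljoin takes over the work of the hand-written gGetAbslLink, gGetHttpAddrFather and gGetHttpAddrFatherAssign helpers, which is the main simplification over the original.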

The code above is everything this article covers on scraping images from a web page with Python and saving them locally; we hope you find it useful.
