Scraping all articles from a Sina blog with Python 3.7 and saving them to Word
The source code in this post for scraping all the articles from a Sina blog with Python is based on an old tutorial that has been copied so widely online, with virtually identical text on many sites, that its original source can no longer be traced. The versions still circulating basically no longer run under Python 3.7 and need quite a few changes. This code was written in a hurry, so it still carries some redundant lines I have not had time to remove or optimize. While using the scraper I also found that its handling of tables is poor, and that it does not download images. There are plenty of image-scraping tutorials online; you can also refer to my earlier post: python3.7保存防盗链图片referer (saving hotlink-protected images in Python 3.7 by setting the Referer header).
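For reference, here is a minimal sketch of the Referer trick from that post. The image URL and Referer value below are placeholders, not real resources:

import urllib.request

# Hypothetical hotlink-protected image; replace with a real URL.
img_url = 'http://album.sina.com.cn/example.jpg'
req = urllib.request.Request(img_url)
# Pretend the request came from the blog page that embeds the image.
req.add_header('Referer', 'http://blog.sina.com.cn/')
with urllib.request.urlopen(req) as resp:
    with open('image.jpg', 'wb') as f:
        f.write(resp.read())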
The code below keeps essentially all of the old version that circulated online, with the changes needed for Python 3.7 applied (the original Python 2 lines are kept as comments; left uncommented, the script would not run).
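Before the full script, the key Python 2 to 3 renames can be summarized in a short, self-contained sketch (using the same sample list page as below):

import urllib.request
import urllib.error

url = 'http://blog.sina.com.cn/s/articlelist_1732853165_0_1.html'
try:
    req = urllib.request.Request(url)            # Python 2: urllib2.Request(url)
    html = urllib.request.urlopen(req).read()    # Python 2: urllib2.urlopen(req)
    print(html.decode('utf-8')[:80])             # 3.x returns bytes; decode explicitly
except urllib.error.URLError as e:               # Python 2: except urllib2.URLError, e:
    print('request failed:', e.reason)
# raw_input() is gone in 3.x; use input() instead.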
import re
import urllib.request                 # Python 2 used: import urllib2
import urllib.error
from urllib.request import urlopen    # Python 2: from urllib import urlopen


class Tool:
    """Strips HTML from a snippet while keeping a readable plain-text layout."""
    removeImg = re.compile('<img.*?>| {7}|')             # drop <img> tags and 7-space runs
    removeAddr = re.compile('<a.*?>|</a>')               # drop hyperlink tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')   # turn line-level tags into \n
    replaceTD = re.compile('<td>')                       # turn table cells into \t
    replacePara = re.compile('<p.*?>')                   # start each paragraph on a new, indented line
    replaceBR = re.compile('<br><br>|<br>')              # turn <br> into \n
    removeExtraTag = re.compile('<.*?>')                 # drop any remaining tags

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n    ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()           # strip() removes leading/trailing whitespace


class XLBK:
    def __init__(self, baseUrl, articleTag, fileName):
        self.baseURL = baseUrl
        self.tool = Tool()
        self.file = None
        self.article = 1
        self.defaultTitle = u'新浪博客'
        self.articleTag = articleTag
        self.fileName = fileName

    def getPage(self, pageNum):
        """Fetch one page of the article list."""
        try:
            url = self.baseURL + str(pageNum) + '.html'
            print('About to fetch: ' + url)
            request = urllib.request.Request(url)        # Python 2: urllib2.Request(url)
            response = urllib.request.urlopen(request)   # Python 2: urllib2.urlopen(request)
            return response.read().decode('utf-8')
        except urllib.error.URLError as e:               # Python 2: except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print("Failed to connect to the Sina blog, reason:", e.reason)
            return None

    def getTitle(self, page):
        """Extract the blog's title from the list page."""
        pattern = re.compile('blogname.*?blognamespan.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:    # check before calling group(), otherwise a miss raises AttributeError
            print("title: " + result.group(1).strip())
            return result.group(1).strip()
        return None

    def getPageNum(self, page):
        """Read the page count from the page's '共N页' ("N pages in total") marker."""
        pattern = re.compile(u'<span style.*?>共(.*?)页</span>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        print(result)
        return 1

    def getContent(self, page):
        """Collect the article titles from one list page."""
        pattern = re.compile('<span class="atc_title">.*?<a.*?href.*?.html">(.*?)</a>.*?</span>', re.S)
        items = re.findall(pattern, page)
        contents = []
        for item in items:
            # keep these as str; .encode('utf-8') here is what caused the stray b' prefixes
            contents.append("\n" + self.tool.replace(item) + "\n")
        return contents

    def getUrl(self, page):
        """Collect the article URLs from one list page."""
        pattern = re.compile('<span class="atc_title">.*?<a.*?href="(.*?)">.*?</a>.*?</span>', re.S)
        return re.findall(pattern, page)

    def getText(self, url):
        """Fetch one article and cut out the body between the page's Chinese markers."""
        text = urlopen(url).read().decode('utf-8')
        start = text.find(u"<!-- 正文开始 -->")    # "body starts" comment in the page source
        end = text.find(u"<!-- 正文结束 -->")      # "body ends" comment
        text = text[start:end]
        text = re.sub(re.compile('<p.*?>'), "\n    ", text)
        text = re.sub(r'<[^>]*>', '', text)        # strip all remaining tags
        text = re.sub(r'&[^>]*?;', ' ', text)      # drop HTML entities such as &nbsp;
        return text

    def setFileTitle(self, title):
        # utf-8 avoids UnicodeEncodeError on platforms whose default encoding is not utf-8
        if title is not None:
            self.file = open(title + ".doc", "w", encoding='utf-8')
        else:
            self.file = open(self.defaultTitle + ".doc", "w", encoding='utf-8')

    def writeData(self, contents, urls):
        # contents (the cleaned titles) are collected but, as in the original, not written out
        for item in urls:
            print('item==' + str(item))
            if self.articleTag == '1':
                articleLine = "\n" + str(self.article) + u"--------------------------------------------------------------------------------\n"
                self.file.write(articleLine)
            self.file.write(item)       # write() needs str, not bytes
            text = self.getText(item)
            print(text)
            self.file.write(text)
            self.article += 1

    def start(self):
        indexPage = self.getPage(1)
        if indexPage is None:           # the fetch failed; nothing to parse
            return
        pageNum = self.getPageNum(indexPage)
        title = self.getTitle(indexPage)    # blog title is printed; the file uses fileName
        self.setFileTitle(self.fileName)
        if pageNum is None:
            print("The URL is no longer valid, please try again")
            return
        try:
            print("This blog has " + str(pageNum) + " pages in total")
            for i in range(1, int(pageNum) + 1):
                print("Writing page " + str(i))
                page = self.getPage(i)
                contents = self.getContent(page)
                urls = self.getUrl(page)
                self.writeData(contents, urls)
        except IOError as e:                        # Python 2: except IOError, e:
            print("Write failed, reason: " + str(e))   # e.message no longer exists in 3.x
        finally:
            if self.file:
                self.file.close()
            print("Write task finished")


# Open a Sina blog's article list, e.g.
# http://blog.sina.com.cn/s/articlelist_1866629225_0_1.html
# -- that blog's code is 1866629225_0_.
# This example uses http://blog.sina.com.cn/s/articlelist_1732853165_0_1.html
baseURL = 'http://blog.sina.com.cn/s/articlelist_1732853165_0_'   # assigned directly here
# Python 2 used raw_input() for the two prompts below
articleTag = input("Write article numbers into the file? 1 = yes, 0 = no\n")   # enter 1 when running
fileName = input("Name for the output document\n")   # any name works, e.g. sina_blog_save
xlbk = XLBK(baseURL, articleTag, fileName)
xlbk.start()
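One caveat: the script writes plain text into a file that merely has a .doc extension. Word will open it, but it is not a real Word document. If you want a genuine .docx, the third-party python-docx package (pip install python-docx) could be swapped in for the open()/write() calls; a minimal sketch with placeholder data:

# Writing articles into a real .docx with python-docx. The title/body
# pairs here are placeholders; in the script above you would feed in
# the cleaned titles from getContent() and the bodies from getText().
from docx import Document

doc = Document()
doc.add_heading(u'新浪博客', level=0)            # document title
articles = [(u'示例标题', u'示例正文')]           # placeholder (title, body) pairs
for i, (title, body) in enumerate(articles, start=1):
    doc.add_heading('%d. %s' % (i, title), level=1)   # one heading per article
    doc.add_paragraph(body)                           # article body
doc.save('sina_blog_save.docx')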
In testing, the scraper fetched Sina blog articles at a reasonable speed; the extracted content still leaves room for cleanup, which I will get to when I have time. Python really is powerful, and its ecosystem of libraries lives up to the hype.
In the spirit of the open web, articles on this site may be freely reproduced as long as the source is credited!
Permalink: https://ranjuan.cn/python-crawler-sinablog/