Scraping all posts from a Sina Blog with Python 3.7 and saving them to a Word file

The source code in this article for scraping all posts from a Sina Blog is based on an old tutorial circulating online; since so many sites carry identical copies of the same Python scraping tutorial, the original source can no longer be traced. The versions floating around basically no longer run under Python 3.7 and need quite a few changes. This code was written in a hurry, so there is redundant code I have not had time to clean up. While using it I also found that table support is poor and that it cannot scrape images. There are plenty of image-scraping tutorials online; you can also refer to my earlier post: python3.7保存防盗链图片referer (saving hotlink-protected images with a Referer header in Python 3.7).
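
For reference, here is a minimal sketch of downloading a hotlink-protected image by sending a Referer header with urllib.request; the URLs below are placeholders, not taken from the post above:

import urllib.request

# Placeholder URLs: use the real image URL and the blog page that embeds it
img_url = 'http://example.com/some_image.jpg'
referer = 'http://blog.sina.com.cn/s/blog_xxxxxxxx.html'

req = urllib.request.Request(img_url, headers={
    'Referer': referer,            # pretend the request comes from the embedding page
    'User-Agent': 'Mozilla/5.0',   # some image servers also check the User-Agent
})
with urllib.request.urlopen(req) as resp, open('image.jpg', 'wb') as f:
    f.write(resp.read())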

This code keeps essentially all of the old version circulating online and has been adapted to run under Python 3.7 (the old code is also preserved, but commented out, since it would not run otherwise).
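
Roughly, the changes needed for Python 3.7 come down to the renamed urllib modules and the new except syntax. A minimal sketch of the Python 3 form used below, with the Python 2 equivalents noted in comments:

import urllib.request
import urllib.error     # in Python 2 both lived in the single urllib2 module

url = 'http://blog.sina.com.cn/s/articlelist_1732853165_0_1.html'
try:
    request = urllib.request.Request(url)                          # Python 2: urllib2.Request(url)
    page = urllib.request.urlopen(request).read().decode('utf-8')  # Python 2: urllib2.urlopen(request).read()
    print(len(page))
except urllib.error.URLError as e:                                 # Python 2: except urllib2.URLError, e:
    print("request failed:", e.reason)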

import urllib
#import urllib2
import urllib.request
import re
#from urllib import urlopen
from urllib.request import urlopen
class Tool:
    #Strip <img> tags and 7-space runs
    removeImg = re.compile('<img.*?>| {7}')
    #Remove hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    #Replace line-break tags with \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    #Replace table cells <td> with \t
    replaceTD= re.compile('<td>')
    #Turn paragraph openings into \n plus an indent
    replacePara = re.compile('<p.*?>')
    #Replace single or double <br> tags with \n
    replaceBR = re.compile('<br><br>|<br>')
    #Strip all remaining tags
    removeExtraTag = re.compile('<.*?>')
    def replace(self,x):
        x = re.sub(self.removeImg,"",x)
        x = re.sub(self.removeAddr,"",x)
        x = re.sub(self.replaceLine,"\n",x)
        x = re.sub(self.replaceTD,"\t",x)
        x = re.sub(self.replacePara,"\n    ",x)
        x = re.sub(self.replaceBR,"\n",x)
        x = re.sub(self.removeExtraTag,"",x)
        #strip() trims leading and trailing whitespace
        return x.strip()
 
class XLBK:
    def __init__(self,baseUrl,articleTag,fileName):
        self.baseURL=baseUrl
        self.tool=Tool()
        self.file=None
        self.article=1
        self.defaultTitle=u'新浪博客'
        self.articleTag=articleTag
        self.fileName=fileName
 
    def getPage(self,pageNum):
        try:
            url=self.baseURL+str(pageNum)+'.html'
            print ('About to fetch: '+url)
            #request= urllib2.Request(url)
            request= urllib.request.Request(url)
            #response=urllib2.urlopen(request)
            response=urllib.request.urlopen(request)
            return response.read().decode('utf-8')
 
        #except urllib2.URLError ,e:
        except urllib.error.URLError as e:
            if hasattr(e,"reason"):
                #print u"连接新浪博客失败,错误原因",e.reason
                print ("连接新浪博客失败,错误原因",e.reason)

                return None
    def getTitle(self,page):
        pattern = re.compile('blogname.*?blognamespan.*?>(.*?)</span>', re.S)
        result = re.search(pattern,page)
        if result:
            print ("title"+result.group(1).strip())
            return result.group(1).strip()
        else:
            return None
 
    def getPageNum(self,page):
        #pattern= re.compile(ur'<span style.*?>共(.*?)页</span>',re.S)
        pattern= re.compile(u'<span style.*?>共(.*?)页</span>',re.S)
        result = re.search(pattern,page)
        if result:
            #print "pagenum"+result.group(1).strip()
            return result.group(1).strip()
        else:
            print (result)
            return 1
 
    def getContent(self,page):
        pattern = re.compile('<span class="atc_title">.*?<a.*?href.*?.html">(.*?)</a>.*?</span>',re.S)
        items = re.findall(pattern,page)
        contents = []
        for item in items:
            content = "\n"+self.tool.replace(item)+"\n"
            contents.append(content)  # keep as str; encoding to bytes makes write() emit b'...' in Python 3
            #print (content)  # this is the post title
        return contents
 
    def getUrl(self,page):
        pattern =re.compile('<span class="atc_title">.*?<a.*?href="(.*?)">.*?</a>.*?</span>',re.S)
        items = re.findall(pattern,page)
        urls = []
        for item in items:
            url = item
            #urls.append(url.encode('utf-8')) # would leave a stray b' at the start of the string
            urls.append(url)

            #print (url)  # the post URL
        return urls
 
 
    def getText(self,url):
         text=urlopen(url).read().decode('utf-8')
         start=text.find(u"<!-- 正文开始 -->")
         print (start)
         end=text.find(u"<!-- 正文结束 -->")
         print (end)
         text=text[start:end]
         text = re.sub(re.compile('<p.*?>'),"\n    ",text)
         text = re.sub(re.compile('<p>'),"\n    ",text)
         text=re.sub(r'<(S*?)[^>]*>.*?|<.*? /> ','',text)
         text=re.sub(r'&[^>]*?\;',' ',text)
         #return text.encode('utf-8')  # garbled encoding
         return text
 
    def setFileTitle(self,title):
        if title is not None:
            self.file = open(title + ".doc","w",encoding='utf-8')  # write UTF-8 so non-GBK characters don't break the write
        else:
            self.file = open(self.defaultTitle + ".doc","w",encoding='utf-8')
 
 
    def writeData_original(self,contents,urls):
        for item in contents:
            #print('item=='+str(item))
            if self.articleTag == '1':
 
                articleLine = "\n" + str(self.article) + u"--------------------------------------------------------------------------------\n"
                self.file.write(articleLine)
            #self.file.write(item) #TypeError: write() argument must be str, not bytes
            self.file.write(str(item))
            #print item
            #self.file.write(urls[contents.index(item)]) #TypeError: write() argument must be str, not bytes
            self.file.write(urls[contents.index(str(item))])
            #print urls[contents.index(item)]
            text=self.getText(urls[contents.index(item)])   
            print (text)
            self.file.write(str(text))
            self.article += 1

    def writeData(self,contents,urls):
        print(urls)
        for item in urls:
            #item ='http://blog.sina.com.cn/s/blog_67493dad0102uxwl.html'
            print('item=='+str(item))
            if self.articleTag == '1':
 
                articleLine = "\n" + str(self.article) + u"--------------------------------------------------------------------------------\n"
                self.file.write(articleLine)
            #self.file.write(item) #TypeError: write() argument must be str, not bytes
            self.file.write(str(item))
            #print item
            #self.file.write(urls[contents.index(item)]) #TypeError: write() argument must be str, not bytes
            #self.file.write(urls[contents.index(str(item))])
            item = str(item).replace("b'", "")  # leftover workaround for bytes URLs; a no-op now that getUrl appends plain str
            #self.file.write(contents[urls.index(item)])

            #print urls[contents.index(item)]
            #text=self.getText(urls[contents.index(item)]) 
            text=self.getText(item)
            #text=self.getText('http://blog.sina.com.cn/s/blog_67493dad0102uxwl.html')
            

            print (text)
            self.file.write(str(text))
            self.article += 1 
 
    def start(self):
        indexPage = self.getPage(1)
        if indexPage is None:
            print ("The URL is no longer valid, please try again")
            return
        pageNum = self.getPageNum(indexPage)
        title = self.getTitle(indexPage)
        self.setFileTitle(self.fileName)
        try:
            print ("This blog has " + str(pageNum) + " pages in total")
            for i in range(1,int(pageNum)+1):
                print ("Writing data from page " + str(i))
                page = self.getPage(i)
                contents = self.getContent(page)
                urls =self.getUrl(page)
                self.writeData(contents,urls)
        #except IOError,e:
        except IOError as e:
            print ("写入异常,原因" + e.message)
        finally:
            print ("写入任务完成")
 
 
 
#print u"打开一个新浪博客的博文目录\n如http://blog.sina.com.cn/s/articlelist_1866629225_0_1.html \n那么该博客的代号为1866629225_0_   \n请输入博客代号"
#baseURL = 'http://blog.sina.com.cn/s/articlelist_' + str(raw_input(""))
#baseURL = 'http://blog.sina.com.cn/s/articlelist_' + str(input(""))
#
#Assume the article-list URL of the Sina blog used in this example is: http://blog.sina.com.cn/s/articlelist_1732853165_0_1.html
baseURL =  'http://blog.sina.com.cn/s/articlelist_1732853165_0_' # assigned directly here instead of read from input
#articleTag = raw_input("Write article numbers into the document? Enter 1 for yes, 0 for no\n")
articleTag = input("Write article numbers into the document? Enter 1 for yes, 0 for no\n") # enter 1 when prompted
#fileName=raw_input("Enter a name for the saved document\n")
fileName=input("Enter a name for the saved document\n") # any name works; it can also be assigned directly, e.g. fileName='sina_blog_save'
xlbk = XLBK(baseURL,articleTag,fileName)
xlbk.start()

Scraping speed was acceptable during testing; the extracted content still leaves room for cleanup, which I will get to when I have time. Python really is powerful, and its libraries live up to the hype.

In the spirit of the open internet, articles on this site may be freely reproduced as long as the source is credited!

Original post: https://ranjuan.cn/python-crawler-sinablog/
