Python爬虫,微信公众号文章采集工具
来源:https://zhuanlan.zhihu.com/p/111352875
#微信公众号文章采集
import requests
from lxml import etree
import re
import os
# Characters that are stripped from the article title before it is used as a
# directory / file name (path separators, Windows-reserved characters,
# whitespace).
_TITLE_UNSAFE_CHARS = re.compile(r'[\t\n\|\/\<\>\:\*\?\\\" ]')
# Whitespace stripped from the author / account-name fields.
_FIELD_WHITESPACE = re.compile(r'[\t\n ]')
# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block forever.
_REQUEST_TIMEOUT = 30
# Name used when nothing is left of the title after sanitising.
_FALLBACK_TITLE = 'untitled'
# Extension used when the image URL mentions none of the known formats.
_DEFAULT_IMAGE_EXT = 'jpg'


def _clean_title(raw_title):
    """Return *raw_title* reduced to a string usable as a file name."""
    title = _TITLE_UNSAFE_CHARS.sub('', raw_title)
    # Remove a literal six-dot ellipsis.  This must NOT be a regex: the
    # pattern '......' matches any six characters and would erase the title.
    title = title.replace('......', '')
    return title or _FALLBACK_TITLE


def _first_text(nodes):
    """Return the first XPath text result without tabs/newlines/spaces, or ''."""
    if not nodes:
        return ''
    return _FIELD_WHITESPACE.sub('', nodes[0])


def _image_extension(img_url):
    """Pick a file extension from substrings of *img_url*.

    The last matching candidate wins (gif over png over jpg); when none
    matches, ``_DEFAULT_IMAGE_EXT`` is used so a name is always available.
    """
    extension = _DEFAULT_IMAGE_EXT
    for candidate in ('jpg', 'png', 'gif'):
        if candidate in img_url:
            extension = candidate
    return extension


def get_con(url):
    """Download one WeChat official-account article and save it locally.

    The page at *url* is fetched and parsed; the title, author, paragraph
    HTML, paragraph text and paragraph images are extracted.  Everything is
    written below ``./weixin/<title>/``: a ``<title>.txt`` file holding the
    title, author, plain text and paragraph HTML, plus one numbered file per
    image.  Progress is printed to stdout.

    Args:
        url: Address of the article page.

    Returns:
        None.

    Raises:
        requests.RequestException: if the page or an image cannot be
            downloaded (network error, timeout, HTTP error status).
        IndexError: if the page has no ``<h2 class="rich_media_title">``.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    con = etree.HTML(response.text)

    # Title: doubles as the name of the output directory and text file.
    titles = con.xpath('//h2[@class="rich_media_title"]/text()')
    if not titles:
        raise IndexError(
            'no <h2 class="rich_media_title"> element found in the page')
    h2 = _clean_title(titles[0])
    print(h2)
    article_dir = os.path.join('.', 'weixin', h2)
    os.makedirs(article_dir, exist_ok=True)

    # Author: "<account nickname>-<writer>", or whichever part is present.
    span1 = _first_text(con.xpath(
        '//span[@class="rich_media_meta rich_media_meta_text"]/text()'))
    print(span1)
    span2 = _first_text(con.xpath(
        '//span[@class="rich_media_meta rich_media_meta_nickname"]/a/text()'))
    print(span2)
    author = '-'.join(part for part in (span2, span1) if part)
    print(author)

    # Paragraphs are queried once and reused for both the HTML and the text.
    paragraphs = con.xpath('//section/p')
    print(len(paragraphs))

    # Serialised HTML of every paragraph, concatenated.
    p_con = ''.join(
        etree.tostring(p, encoding='utf-8').decode() for p in paragraphs)
    print(p_con)

    # Plain text of every paragraph, one paragraph per line.
    print(paragraphs)
    p_text = ''.join(p.xpath('string(.)') + '\n' for p in paragraphs)
    print(p_text)

    # Save title, author, text and HTML into a single text file.
    con_text = f'{h2}\n{author}\n{p_text}\n{p_con}'
    with open(os.path.join(article_dir, f'{h2}.txt'), 'w',
              encoding='utf-8') as f:
        f.write(con_text)
    print(f'保存 {h2} 内容成功!')

    # Images: the URL is read from ``data-src``, falling back to ``src``.
    for i, p_img in enumerate(con.xpath('//section/p/img')):
        img_url = p_img.attrib.get('data-src') or p_img.attrib.get('src')
        if not img_url:
            # Nothing to download for this <img>; its index is left unused.
            continue
        print(img_url)
        img_name = f'{i}.{_image_extension(img_url)}'
        print(img_name)
        r = requests.get(img_url, timeout=_REQUEST_TIMEOUT)
        # Do not store an HTTP error page under an image file name.
        r.raise_for_status()
        with open(os.path.join(article_dir, img_name), 'wb') as f:
            f.write(r.content)
        print(f'保存 {img_name} 图片成功!')
    print(f'保存 {h2} 所有图片成功!')
if __name__ == '__main__':
    # Script entry point: read the article address from the console and
    # hand it straight to the scraper.
    get_con(input("请输入要采集的微信公众号文章地址:"))
备注:
在
h2=re.sub(r'[\t\n\/\<\>\:\*\?\\\" ]','',h2) #去除多余的字符
中,有时标题中存在|?等符号,可能会存在问题,可以参考对该处进行调整。