Grabbing the tags of a certain site, for later semantic analysis.
# coding: utf-8
import re
import ssl

import requests
import urllib3
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# The target site is fetched over HTTPS without certificate verification,
# so disable verification and silence the resulting warnings.
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()
def getdate(html):
    """Extract the raw '<span class="tags">Tags: ...' fragment from a page."""
    reg = r'<span class="tags">Tags: (.*?)</a></span>'
    mylist = re.findall(reg, html)
    return mylist[0] if mylist else ''

def getA(html):
    """Extract the visible text of a single '<a ... rel="tag">' element."""
    reg = r'<a href=.*?rel="tag">(.*?)</a>'
    mylist = re.findall(reg, html)
    return mylist[0] if mylist else ''
def start(aid):
    """Fetch article `aid`, pull out its tags, and append them to the result file."""
    try:
        url = 'https://www.xxx.org/ar/%d.shtml' % aid
        req = requests.get(url, verify=False)
        if req.status_code == 200:
            html = req.text
            res = getdate(html)
            soup = BeautifulSoup(res, 'html.parser')
            for i in soup.find_all('a'):
                tag = getA(str(i)).lower()
                if not tag:
                    continue
                tagslist.append(tag)
                # Append one tag per line; reopening the file each time keeps
                # results on disk even if the crawl is interrupted.
                with open('secplus_tags_res.txt', 'a') as f:
                    f.write(tag + '\n')
        else:
            print('id', aid, ',status_code:', req.status_code)
    except Exception as e:
        print(aid, e)
def main():
    global tagslist
    tagslist = []
    idlist = list(range(1194202))
    with ThreadPoolExecutor(max_workers=20) as pool:
        # pool.map is lazy; iterating over it drives the crawl to completion.
        for res in pool.map(start, idlist):
            if res is not None:
                print(res)

if __name__ == '__main__':
    main()
So far a little over 4,800 articles have been processed.
The collected text still needs further processing to filter out noise and data that is not actually needed.
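As one possible next step, here is a minimal cleanup sketch. It assumes the output file secplus_tags_res.txt holds one lowercase tag per line; the clean_tags function, the STOP_TAGS set, and the MIN_COUNT threshold are illustrative assumptions, not values from the run above.

# Minimal cleanup sketch (assumption: one tag per line in secplus_tags_res.txt).
from collections import Counter

STOP_TAGS = {'none', 'n/a', ''}   # hypothetical noise tags to drop
MIN_COUNT = 2                     # hypothetical threshold: keep tags seen at least twice

def clean_tags(src='secplus_tags_res.txt', dst='secplus_tags_clean.txt'):
    with open(src, encoding='utf-8') as f:
        tags = [line.strip().lower() for line in f]
    # Count occurrences, ignoring empty lines and known noise tags.
    counts = Counter(t for t in tags if t and t not in STOP_TAGS)
    kept = [t for t, c in counts.most_common() if c >= MIN_COUNT]
    with open(dst, 'w', encoding='utf-8') as f:
        f.writelines(t + '\n' for t in kept)
    return counts

if __name__ == '__main__':
    clean_tags()

Deduplication plus a frequency cutoff is only one way to prune the list; the threshold would need tuning against the real data before the tags are fed into any semantic analysis.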