微慑信息网

某站tags获取 – vulsee.com

某站tags获取,用于语义分析

#coding:utf-8
import requests
import re
import ssl
import urllib3
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,as_completed

# Replace the ssl module's default HTTPS context factory with the unverified
# one, so code that relies on that default skips certificate verification.
ssl._create_default_https_context = ssl._create_unverified_context
# Silence urllib3 warnings (the requests in this script use verify=False).
requests.packages.urllib3.disable_warnings()

def getdate(html):
	"""Return the raw tag markup of the first tags span found in *html*.

	The result is the text captured between ``<span class="tags">Tags: `` and
	the closing ``</a></span>``. The pattern consumes that final ``</a>``, so
	the returned fragment ends with an anchor that is not closed.

	Raises:
		IndexError: if *html* contains no matching span.
	"""
	tag_span = re.compile(r'<span class="tags">Tags: (.*?)</a></span>')
	return tag_span.findall(html)[0]

def getA(html):
	"""Return the link text of the first ``rel="tag"`` anchor in *html*.

	Raises:
		IndexError: if *html* contains no such anchor.
	"""
	tag_anchor = re.compile(r'<a href=.*?rel="tag">(.*?)</a>')
	found = tag_anchor.findall(html)
	return found[0]

def start(aid):
	"""Fetch one article page and record its tags.

	Downloads ``https://www.xxx.org/ar/<aid>.shtml``, extracts the tag anchors
	from the page, lower-cases each tag text, then appends it to the
	module-level ``tagslist`` and to ``secplus_tags_res.txt`` (one per line).

	Non-200 responses and any exception are reported on stdout instead of
	being raised, so a single bad page does not abort the whole crawl.

	Args:
		aid: Integer article id substituted into the URL.

	Returns:
		None.
	"""
	try:
		url = 'https://www.xxx.org/ar/%d.shtml' % aid
		# NOTE: verify=False skips TLS certificate validation (kept as-is).
		# The (connect, read) timeout stops a stalled connection from
		# blocking a worker thread indefinitely.
		req = requests.get(url, verify=False, timeout=(5, 15))
		if req.status_code != 200:
			print ('id',aid,',status_code:',req.status_code)
			return
		soup = BeautifulSoup(getdate(req.text), 'html.parser')
		# Open the output once per page, with an explicit encoding so that
		# non-ASCII tags do not depend on the platform's default codec.
		with open('secplus_tags_res.txt', 'a', encoding='utf-8') as f:
			for anchor in soup.find_all('a'):
				tag = getA(str(anchor)).lower()
				tagslist.append(tag)
				f.write(tag + '\n')
	except Exception as e:
		# Best-effort crawl: report the failing id and carry on.
		print (aid,e)

def main():
	"""Crawl article ids 0 through 1194201 using a pool of 20 threads.

	Resets the module-level ``tagslist`` to an empty list, then maps
	``start`` over every id and prints any non-None result.
	"""
	global tagslist
	tagslist = []
	with ThreadPoolExecutor(max_workers=20) as pool:
		# Pass the range directly; building a 1.19M-element list first only
		# costs memory.
		for result in pool.map(start, range(1194202)):
			if result is not None:
				print (result)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()

目前跑了4800多个:


后续还需要对该文本进行数据处理,排除干扰、非需求数据

 

本文标题:某站tags获取 – vulsee.com
本文链接:
(转载请附上本文链接)
https://vulsee.com/archives/vulsee_2022/0107_16043.html
转载请附本站链接,未经允许不得转载,谢谢:微慑信息网-VulSee.com » 某站tags获取 – vulsee.com
分享到: 更多 (0)

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

微慑信息网 专注工匠精神

访问我们 | 联系我们