微慑信息网

某站tags获取 - vulsee.com

某站tags获取,用于语义分析

#coding:utf-8
import requests
import re
import ssl
import urllib3
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,as_completed

# Swap ssl's default HTTPS context factory for the unverified one, so code
# relying on that default no longer validates server certificates.
ssl._create_default_https_context = ssl._create_unverified_context
# Silence urllib3 warnings (e.g. the ones emitted for verify=False requests).
requests.packages.urllib3.disable_warnings()

def getdate(html):
	"""Return the raw markup found inside the page's tags span.

	Searches ``html`` for ``<span class="tags">Tags: ... </a></span>`` and
	returns the text captured between ``Tags: `` and the closing
	``</a></span>`` of the first occurrence. No regex flags are used, so
	the whole span has to sit on a single line to match.

	Raises:
		IndexError: if the page contains no matching span.
	"""
	pattern = re.compile(r'<span class="tags">Tags: (.*?)</a></span>')
	matches = pattern.findall(html)
	# Only the first tags span is of interest; indexing raises when none exist.
	return matches[0]

def getA(html):
	"""Return the link text of the first ``rel="tag"`` anchor in ``html``.

	Raises:
		IndexError: if ``html`` holds no ``<a href=... rel="tag">`` anchor.
	"""
	anchor_pattern = re.compile(r'<a href=.*?rel="tag">(.*?)</a>')
	# findall yields the captured link texts; the first one is the answer.
	return anchor_pattern.findall(html)[0]

def start(aid, timeout=15):
	"""Fetch one article page and record the tags it lists.

	Downloads ``https://www.xxx.org/ar/<aid>.shtml``, extracts the tags
	span with ``getdate``, and for every anchor inside it appends the
	lower-cased tag text to the global ``tagslist`` and to
	``secplus_tags_res.txt`` (one tag per line).

	Args:
		aid: Integer article id substituted into the URL.
		timeout: Seconds to wait on the HTTP request before giving up.

	Returns:
		None. Non-200 responses and any exception are reported on stdout
		instead of being raised, so one bad page never stops the crawl.
	"""
	try:
		url = 'https://www.xxx.org/ar/%d.shtml' % aid
		# Without a timeout a stalled connection would block this worker forever.
		req = requests.get(url, verify=False, timeout=timeout)
		if req.status_code != 200:
			print ('id',aid,',status_code:',req.status_code)
			return
		res = getdate(req.text)
		soup = BeautifulSoup(res, 'html.parser')
		# Open the output once per page rather than once per tag, and pin the
		# encoding so non-ASCII tags do not depend on the platform default.
		with open('secplus_tags_res.txt', 'a', encoding='utf-8') as f:
			for link in soup.find_all('a'):
				tag = getA(str(link)).lower()
				tagslist.append(tag)
				f.write(tag + '\n')
	except Exception as e:
		# Best-effort crawl: report the failing id and carry on.
		print (aid,e)

def main(max_id=1194202, workers=20):
	"""Crawl article ids ``0 .. max_id - 1`` with a pool of worker threads.

	Resets the global ``tagslist`` and runs ``start`` for every id on a
	``ThreadPoolExecutor``. Any non-None value returned by ``start`` is
	printed.

	Args:
		max_id: Exclusive upper bound of the article ids to fetch.
		workers: Number of threads in the pool.
	"""
	global tagslist
	tagslist = []
	with ThreadPoolExecutor(max_workers=workers) as pool:
		# range() is passed directly; building a list of every id first is
		# unnecessary because map() consumes any iterable.
		for outcome in pool.map(start, range(max_id)):
			if outcome is not None:
				print (outcome)

# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
	main()

目前跑了4800多个:


后续还需要对该文本进行数据处理,排除干扰、非需求数据

 

赞(0) 打赏
转载请附本站链接,未经允许不得转载,谢谢:微慑信息网-VulSee.com » 某站tags获取 - vulsee.com

评论 抢沙发

微慑信息网 专注工匠精神

微慑信息网-VulSee.com-关注前沿安全态势,聚合网络安全漏洞信息,分享安全文档案例

访问我们 联系我们

觉得文章有用就打赏一下文章作者

非常感谢你的打赏,我们将继续提供更多优质内容,让我们一起创建更加美好的网络世界!

支付宝扫一扫

微信扫一扫

登录

找回密码

注册