crawler.py
'''
A static site crawler, used like this:

    c = Crawler("https://www.google.com", maxSites=10000, updateInterval=30)
    c.crawl()

It creates an output file with a name in the format: <timestamp><site_url>.txt
maxSites is the maximum number of sites that the crawler will check.
updateInterval is the amount of time, in seconds, that the crawler will wait
before updating the output file; an updateInterval of 30 means it will update
the output file every 30 seconds.

I was pretty new to Python when I wrote this, so your code-reading
displeasure is deeply regretted.
'''
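# For example (hypothetical values): a crawl of https://www.google.com started
# at Unix time 1700000000 writes its results to "1700000000www.google.com.txt".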
import time

import requests
from bs4 import BeautifulSoup


class Crawler:
    def __init__(self, starting, maxSites, updateInterval):
        self.sites = [starting]              # discovered URLs, in discovery order
        self.maxSites = maxSites
        self.startTime = int(time.time())
        self.updateInterval = updateInterval
        self.prevUpdateTime = time.time()
        self.loggedTillIndex = 0             # how many sites have been written out
        # Derive the host name for the output file name, e.g.
        # "https://www.google.com/search" -> "www.google.com". The original
        # sliced with find('/'), which chopped the last character whenever
        # the URL had no path after the host.
        siteName = starting.split("//", 1)[-1].split("/", 1)[0]
        self.filePath = str(self.startTime) + siteName + ".txt"
    def parseLinks(self, htm):
        # Extract all absolute http(s) links from an HTML document.
        soup = BeautifulSoup(htm, 'html.parser')
        links = []
        for a_elem in soup.find_all('a'):
            href = a_elem.get('href')
            if href is not None:
                href = href.strip()
                if href.startswith("http://") or href.startswith("https://"):
                    links.append(href)
        return links
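    # Example (hypothetical markup): parseLinks('<a href="https://x.com">x</a>')
    # returns ["https://x.com"]; relative links such as href="/about" are skipped.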
    def isInList(self, s):
        # TODO: Implement efficiently (membership test on a list is O(n))
        return s in self.sites
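    # A minimal sketch for the TODO above, assuming a parallel `seen` set
    # (a hypothetical attribute, not in the original): sets give O(1)
    # membership tests, so large crawls no longer slow down linearly.
    #
    #     self.seen = {starting}        # in __init__, next to self.sites
    #     def isInList(self, s):
    #         return s in self.seen
    #     # ...and in visit(): self.seen.add(link) next to self.sites.append(link)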
    def visit(self, addr):
        # Fetch one page and queue any links on it we haven't seen before.
        try:
            # The timeout is an addition; without one, a single stalled
            # server can hang the whole crawl.
            resp = requests.get(addr, timeout=10)
        except requests.exceptions.RequestException:
            print("Failed to get:\n" + addr)
            return
        for link in self.parseLinks(resp.text):
            if not self.isInList(link):
                print("Found: " + link)
                self.sites.append(link)
    def crawl(self):
        # Walk self.sites in order; the list grows as visit() appends
        # newly discovered links, so this is a breadth-first traversal.
        count = 0
        while count < len(self.sites) and len(self.sites) <= self.maxSites:
            curTime = time.time()
            if curTime - self.prevUpdateTime >= self.updateInterval:
                self.prevUpdateTime = curTime
                self.logToFile()
            self.visit(self.sites[count])
            count = count + 1
        # Flush any sites discovered since the last periodic update.
        if self.loggedTillIndex < len(self.sites):
            self.logToFile()
    def logToFile(self):
        print("No. of sites now: " + str(len(self.sites)))
        numSites = len(self.sites)
        # Append only the sites not yet written. The original started at
        # loggedTillIndex + 1 and so never wrote index 0 (the starting URL);
        # treating loggedTillIndex as a count of sites already logged fixes
        # that off-by-one.
        with open(self.filePath, 'a+') as f:
            for i in range(self.loggedTillIndex, numSites):
                f.write(self.sites[i] + "\n")
        self.loggedTillIndex = numSites

if __name__ == "__main__":
    c = Crawler("https://" + input("Enter site address:\n"),
                maxSites=10000, updateInterval=30)
    c.crawl()