基于selenium的网络爬虫

配置环境

1
sudo apt install chromium chromium-chromedriver python3-selenium

安装chromium时可能需要代理,因为可能使用snap源

使用

关于xpath(实践中发现非常好用!)

https://www.w3school.com.cn/xpath/xpath_syntax.asp

使用下面的爬虫爬取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import queue
import random
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Module-level browser session shared by the functions below.
# Uses chromedriver (its setup was covered in the previous post).
driver = webdriver.Chrome()
driver.maximize_window()
# Open the Weibo home page first, then pause for 10 seconds before crawling.
# NOTE(review): the pause is presumably there so the operator can log in by
# hand -- confirm.
driver.get('https://weibo.com')
time.sleep(10)

def getBlogAndFocuesd(userid):
    """Return the followed-user ids and the blog texts found on a user's page.

    Loads ``https://weibo.com/<userid>`` in the module-level ``driver``, waits
    five seconds, then scrapes two things from the page with XPath queries.

    Args:
        userid: Path component appended to ``https://weibo.com/`` (a numeric
            id or a custom profile name).

    Returns:
        A ``(uids, blogs)`` tuple. ``uids`` is a list of ids extracted from
        the ``href`` of each followed-user link; ``blogs`` is a list with the
        text of each ``WB_detail`` element, with newlines removed.
    """
    blog_xpath = '//div[@class="WB_detail"]'
    focused_xpath = '//ul[@class="picitems_ul clearfix"]/li/div/p/a[@class="S_txt1"]'
    start_url = 'https://weibo.com/'

    driver.get(start_url + userid)
    time.sleep(5)  # fixed pause before querying the page

    # find_elements(By.XPATH, ...) works on both Selenium 3 and Selenium 4;
    # the find_elements_by_xpath shortcut no longer exists in Selenium 4.
    uids = []
    for link in driver.find_elements(By.XPATH, focused_xpath):
        href = link.get_property('href')
        if not href:
            continue
        # Drop the site prefix and any query string.
        uid = href[len(start_url):].split('?')[0]
        # A remaining slash means a two-character path prefix precedes the id
        # (presumably "u/" -- confirm against live pages); strip it.
        if '/' in uid:
            uid = uid[2:]
        uids.append(uid)

    blogs = [
        post.text.replace('\n', '')
        for post in driver.find_elements(By.XPATH, blog_xpath)
    ]
    return uids, blogs
def controller(maxBlogs, startUsers):
    """Crawl randomly chosen users until enough blog posts have been saved.

    Starting from ``startUsers``, repeatedly picks a random pending user,
    scrapes that user's page with ``getBlogAndFocuesd``, queues the users
    they follow, and appends one ``<userid>,<blog text>`` line per post to
    ``blogs.txt`` (overwritten on each run). Progress is printed after every
    user. The shared ``driver`` window is closed when the crawl ends, whether
    it finished normally or raised.

    Args:
        maxBlogs: Stop once at least this many posts have been written.
        startUsers: Iterable of user ids used to seed the queue.

    The crawl also stops early if the queue of pending users runs dry.
    """
    # De-duplicate the seeds while keeping their order.
    pending = list(dict.fromkeys(startUsers))
    # Every id ever queued, so a user is never queued -- or crawled -- twice.
    known = set(pending)
    cnt = 0

    try:
        # Explicit UTF-8: the posts are mostly Chinese text and the locale
        # default encoding is not guaranteed to be able to represent it.
        with open('blogs.txt', 'w', encoding='utf-8') as f:
            while cnt < maxBlogs and pending:
                # randrange gives a uniform pick over the valid indices.
                selUserId = pending.pop(random.randrange(len(pending)))
                users, blogs = getBlogAndFocuesd(selUserId)

                for u in users:
                    if u not in known:
                        known.add(u)
                        pending.append(u)

                for b in blogs:
                    f.write(selUserId + "," + b + '\n')
                f.flush()  # keep the file current in case the crawl is interrupted

                cnt += len(blogs)
                print("current: ", selUserId, "len:", len(blogs), "sum:", cnt, "process:", cnt / maxBlogs)

                # Pause 3 seconds between users, plus 2 more about half the time.
                time.sleep(3)
                if random.random() > 0.5:
                    time.sleep(2)
    finally:
        driver.close()

if __name__ == '__main__':
    # Not referenced by anything else in the visible code.
    userid = '2259906485'
    # Seed accounts the crawl starts from.
    startUsers = [
        '2259906485',
        'shangganju',
        '1046733017',
        '6204415793',
        '6591509611',
        'rmrb',
        'CHINAD8',
        '5857559022',
        '5721826695',
        '1739663283',
    ]
    # Collect up to 50000 posts.
    controller(50000, startUsers)