网络爬虫入门

主要使用 requests 和 BeautifulSoup 库，其中 BeautifulSoup 通过 CSS 选择器（`select` 方法）来定位页面元素。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# encoding=utf-8
import requests
import json
from bs4 import BeautifulSoup
# from bs4.element import Tag
import pandas as pd
import time

def _build_record(href, title, text):
    """Return the result row for one article, or None when it is filtered out.

    An article is kept when it mentions HR ('人力' or '人事') and either
    mentions a bachelor's degree ('本科') or does not mention a master's /
    postgraduate degree ('硕士' / '研究生'). Otherwise it is still kept, but
    flagged for manual review, when its text is shorter than 1000 characters
    and does not mention a master's / postgraduate degree.
    """
    no_postgrad = '硕士' not in text and '研究生' not in text
    hr_related = '人力' in text or '人事' in text
    if hr_related and ('本科' in text or no_postgrad):
        needs_review = False
    elif len(text) < 1000 and no_postgrad:
        # Some articles consist only of images, so their text is very short.
        needs_review = True
    else:
        return None
    return {
        '链接': href,
        'title': title,
        '文章过短需要二次查看': needs_review,
        '文章长度': len(text),
    }


def getOk(base, timeout=10):
    """Scan one listing page and collect the articles that pass the filter.

    Downloads ``base``, follows the link of every ``.zg_spbt`` entry, reads
    the text of the first ``.zg_list_zi`` element of the linked page and
    keeps the article when ``_build_record`` accepts it.

    Args:
        base: URL of the listing page.
        timeout: Seconds passed to every ``requests.get`` call so that a
            stalled connection raises instead of hanging forever.

    Returns:
        A list of dicts with the keys '链接', 'title',
        '文章过短需要二次查看' and '文章长度'.
    """
    print(base)
    res = []
    # NOTE(review): the listing response keeps whatever encoding requests
    # detects, while article pages are forced to gbk below - confirm that
    # titles taken from the listing decode correctly.
    response = requests.get(base, timeout=timeout)
    listing = BeautifulSoup(response.text, 'lxml')
    for item in listing.select('.zg_spbt'):
        children = item.children
        # The first child is discarded and the second one is used as the
        # link element (presumably a whitespace text node precedes the <a>;
        # verify against the page markup).
        next(children)
        aTag = next(children)
        href = aTag.get('href')
        title = aTag.get_text().replace('\n', '').replace('\r', '')
        if not href:
            # Nothing to download for this entry.
            print('skip (no href):', title)
            continue
        article = requests.get(href, timeout=timeout)
        article.encoding = 'gbk'
        nodes = BeautifulSoup(article.text, 'lxml').select('.zg_list_zi')
        if nodes:
            record = _build_record(href, title, nodes[0].get_text())
            if record is not None:
                print(href, title)
                res.append(record)
        else:
            # Page without the expected content node: skip it instead of
            # aborting the whole crawl with an IndexError.
            print('skip (no .zg_list_zi):', href)
        # Throttle between article downloads.
        time.sleep(0.5)
    return res
def main():
    """Crawl the listing pages and export the matching articles to work.xlsx.

    Builds the listing URLs, gathers the rows returned by ``getOk`` for each
    of them, prints the combined list and writes it to an Excel file.
    """
    preUrl = 'http://www.zggqzp.com/zpxx/'
    # Only the first 8 pages contain 2020/2021 recruitment postings.
    page_names = ['60_60_2_0.html'] + ['60_60_2_%d.html' % n for n in range(2, 9)]
    rows = []
    for page_name in page_names:
        rows.extend(getOk(preUrl + page_name))
    print(rows)
    frame = pd.DataFrame(rows)
    frame.to_excel(
        'work.xlsx',
        columns=['链接', 'title', '文章过短需要二次查看', '文章长度'],
    )
# Run the crawl only when the file is executed as a script; importing the
# module must neither start network requests nor print anything.
if __name__ == '__main__':
    main()
    print('ok')
# for i in soup.select('.zg_spsj'):
# print(i)