2021-02-05
网络爬虫入门

主要使用 requests 和 BeautifulSoup 库；BeautifulSoup 通过 CSS 选择器（select 方法）定位页面元素
# encoding=utf-8
import requests
import json
from bs4 import BeautifulSoup
# from bs4.element import Tag
import pandas as pd
import time

# Seconds to wait for any single HTTP response before giving up, so that one
# stalled connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 10


def _review_flag(text):
    """Classify an article body against the job filter.

    Returns:
        False if the text mentions '人力' or '人事' and either mentions '本科'
        or mentions neither '硕士' nor '研究生' (kept, no manual review).
        True if it failed that test but is shorter than 1000 characters and
        mentions neither '硕士' nor '研究生' (kept, flagged for a second look).
        None if the article should be dropped.
    """
    no_postgrad = '硕士' not in text and '研究生' not in text
    mentions_hr = '人力' in text or '人事' in text
    if mentions_hr and ('本科' in text or no_postgrad):
        return False
    # Some articles only contain an image, so a short body is kept for review.
    if len(text) < 1000 and no_postgrad:
        return True
    return None


def _fetch_record(href, title):
    """Download one article and build its result row.

    Returns the row dict, or None when the article cannot be fetched, has no
    '.zg_list_zi' element, or is rejected by the filter.
    """
    try:
        page = requests.get(href, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # One unreachable article must not discard the rows collected so far.
        print('skip %s: %s' % (href, err))
        return None
    page.encoding = 'gbk'
    bodies = BeautifulSoup(page.text, 'lxml').select('.zg_list_zi')
    if not bodies:
        print('skip %s: no .zg_list_zi element' % href)
        return None
    text = bodies[0].get_text()
    flag = _review_flag(text)
    if flag is None:
        return None
    return {
        '链接': href,
        'title': title,
        '文章过短需要二次查看': flag,
        '文章长度': len(text),
    }


def getOk(base):
    """Scan one listing page and return the postings that pass the filter.

    For every '.zg_spbt' element on the listing page, the second child node
    is taken as the article link. Each linked article is downloaded, decoded
    as GBK and classified by its '.zg_list_zi' text.

    Args:
        base: URL of the listing page.

    Returns:
        A list of dicts with the keys '链接', 'title', '文章过短需要二次查看'
        and '文章长度'.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched. Failures on individual articles are printed and skipped.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    print(base)
    res = []
    # NOTE(review): the listing page is decoded with whatever encoding
    # requests infers, while article pages are forced to GBK - confirm that
    # titles decode correctly.
    response = requests.get(base, timeout=_REQUEST_TIMEOUT)
    listing = BeautifulSoup(response.text, 'lxml')
    for entry in listing.select('.zg_spbt'):
        contents = entry.contents
        link = contents[1] if len(contents) > 1 else None
        # bs4 text nodes are str subclasses and carry no attributes.
        if link is None or isinstance(link, str) or not link.get('href'):
            continue
        # Resolve relative links; absolute URLs pass through unchanged.
        href = urljoin(base, link.get('href'))
        title = link.get_text().replace('\n', '').replace('\r', '')
        record = _fetch_record(href, title)
        if record is not None:
            print(href, title)
            res.append(record)
        time.sleep(0.5)  # be polite to the server between article requests
    return res
def main():
    """Crawl the listing pages, filter the postings and save them to Excel.

    Every listing page is passed to getOk; the collected rows are printed and
    written to 'work.xlsx' in the current directory. A listing page that
    cannot be fetched is reported and skipped so that rows gathered from the
    other pages are still saved.
    """
    preUrl = 'http://www.zggqzp.com/zpxx/'
    columns = ['链接', 'title', '文章过短需要二次查看', '文章长度']
    # The first page uses index 0 and later pages are numbered 2..8; only the
    # first 8 pages carry 2020/2021 recruitment postings.
    pagesUrl = ['60_60_2_0.html']
    pagesUrl.extend('60_60_2_%d.html' % i for i in range(2, 9))
    res = []
    for page in pagesUrl:
        try:
            res.extend(getOk(preUrl + page))
        except requests.RequestException as err:
            print('skip page %s: %s' % (page, err))
    print(res)
    # Passing the columns here keeps the header present when no row matched;
    # selecting them in to_excel raises KeyError on an empty frame.
    df = pd.DataFrame(res, columns=columns)
    df.to_excel('work.xlsx')
# Script entry point: run the crawl, then print a completion marker.
if __name__ == "__main__":
    main()
    print("ok")
# for i in soup.select('.zg_spsj'):
#     print(i)