1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
| from selenium import webdriver import time import queue import random
# Start one Chrome session at import time; the functions below use this
# module-level `driver` for every page they load.
driver = webdriver.Chrome()
driver.maximize_window()
# Open the Weibo home page, then pause 10 seconds before any crawling starts.
# NOTE(review): presumably the pause leaves time for the page to load and/or
# for a manual login -- confirm.
driver.get('https://weibo.com')
time.sleep(10)
def _uid_from_href(href, start_url):
    """Return the user id encoded in a profile link, or '' if there is none.

    The query string is dropped.  When the remaining path still contains a
    '/', its first two characters are removed.
    NOTE(review): presumably this strips the ``u/`` prefix of numeric profile
    URLs (``https://weibo.com/u/<id>``) -- confirm against the live markup.
    """
    path = href[len(start_url):].split('?')[0]
    if '/' in path:
        path = path[2:]
    return path


def getBlogAndFocuesd(userid):
    """Return the users followed by ``userid`` and that user's latest posts.

    Loads ``https://weibo.com/<userid>`` in the module-level ``driver``,
    waits 5 seconds, then scrapes the page.

    Args:
        userid: Path component appended to ``https://weibo.com/``.

    Returns:
        A ``(uids, blogs)`` tuple: ``uids`` is a list of user ids taken from
        the followed-user links, ``blogs`` is a list of post texts with all
        newlines removed.
    """
    blogClass = '//div[@class="WB_detail"]'
    focusedClass = ('//ul[@class="picitems_ul clearfix"]'
                    '/li/div/p/a[@class="S_txt1"]')
    startUrl = 'https://weibo.com/'

    driver.get(startUrl + userid)
    time.sleep(5)

    # The locator-string form of find_elements is used because the
    # find_elements_by_* helpers were removed in Selenium 4.3; "xpath" is the
    # value of By.XPATH, so no extra import is needed.
    focusedElems = driver.find_elements("xpath", focusedClass)
    uids = []
    for elem in focusedElems:
        href = elem.get_property('href')
        # Skip missing links and links outside startUrl: slicing them by
        # len(startUrl) would yield a meaningless id.
        if not href or not href.startswith(startUrl):
            continue
        uid = _uid_from_href(href, startUrl)
        if uid:
            uids.append(uid)

    blogElems = driver.find_elements("xpath", blogClass)
    blogs = [elem.text.replace('\n', '') for elem in blogElems]
    return uids, blogs


def controller(maxBlogs, startUsers):
    """Crawl users at random until ``maxBlogs`` posts have been written.

    Starting from ``startUsers``, repeatedly picks a random pending user,
    scrapes it with ``getBlogAndFocuesd``, queues the users it follows, and
    appends one ``<userid>,<post text>`` line per post to ``blogs.txt``
    (overwritten on each run).  Stops early if no pending users remain.
    The module-level ``driver`` window is closed on exit, even on error.

    Args:
        maxBlogs: Number of posts to collect before stopping.
        startUsers: Iterable of user ids used to seed the crawl.
    """
    # De-duplicate the seeds while keeping their order.
    pending = list(dict.fromkeys(startUsers))
    # Every id that was ever queued; prevents queuing or crawling a user twice.
    seen = set(pending)
    cnt = 0

    try:
        # Explicit UTF-8 so non-ASCII post text is written the same way on
        # every platform instead of depending on the locale default.
        with open('blogs.txt', 'w', encoding='utf-8') as f:
            while cnt < maxBlogs and pending:
                # randrange excludes the upper bound, so every pending user
                # is equally likely and the index is always valid.
                selUserId = pending.pop(random.randrange(len(pending)))

                users, blogs = getBlogAndFocuesd(selUserId)
                for u in users:
                    if u not in seen:
                        seen.add(u)
                        pending.append(u)

                for b in blogs:
                    f.write(selUserId + "," + b + '\n')
                # Flush after each user so progress survives an interruption.
                f.flush()

                cnt += len(blogs)
                print("current: ", selUserId, "len:", len(blogs),
                      "sum:", cnt, "process:", cnt / maxBlogs)

                # Pause 3 seconds between users, plus 2 more about half the
                # time, so the request interval is not constant.
                time.sleep(3)
                if random.random() > 0.5:
                    time.sleep(2)

            if cnt < maxBlogs:
                print("user queue exhausted, sum:", cnt)
    finally:
        driver.close()


if __name__ == '__main__':
    startUsers = ['2259906485', 'shangganju', '1046733017', '6204415793',
                  '6591509611', 'rmrb', 'CHINAD8', '5857559022',
                  '5721826695', '1739663283']
    controller(50000, startUsers)
|