#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Time:2018/6/4
"""
第一步:获得网站
http://mas.58.com/job/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key=
第二步:获得该主页下的所有招聘url,和 薪资 补助 名称 要求
第三步:通过遍历所有的招聘url,获得该工作的地址
第四步:把这些信息放入数据表中
"""
import requests
import re
from bs4 import BeautifulSoup
import threading
import xlwt
# Module-level accumulators, filled in lockstep by getInf() (one entry per
# scraped posting) and read back by writeExcel():
#   infs       - combined human-readable summary string
#   infnames   - job titles          infadds  - work addresses
#   infsalaries- salary + unit       infcops  - company names
#   infwels    - welfare/benefits    infquals - education requirements
#   infexers   - experience requirements
(infs, infnames, infadds, infsalaries,
 infcops, infwels, infquals, infexers) = ([] for _ in range(8))
def getUrl(Urls, html):
    """Parse *html* into a BeautifulSoup tree and return it.

    ``Urls`` is accepted for interface compatibility but is currently
    unused — presumably it was meant to collect the posting URLs of the
    module plan's step 2, which was never implemented; TODO confirm.
    The original discarded the parsed tree (implicit ``return None``);
    returning it makes the helper actually usable by a caller.
    """
    soup = BeautifulSoup(html, 'lxml')
    return soup
def writeExcel():
    """Dump the collected job fields into '58.xls', one row per posting.

    Columns: title, address, salary, company, welfare, education,
    experience.  Reads the module-level parallel lists; they stay the
    same length because getInf() appends to all of them in lockstep.
    (No ``global`` statement needed: the lists are only read here.)
    """
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet(u'sheet', cell_overwrite_ok=True)
    rows = zip(infnames, infadds, infsalaries, infcops, infwels, infquals, infexers)
    for row_idx, row in enumerate(rows):
        for col_idx, value in enumerate(row):
            sheet1.write(row_idx, col_idx, value)
    workbook.save('58.xls')
def getInf(soup):
    """Extract every job posting on one list page into the module lists.

    *soup* is a BeautifulSoup tree of a 58.com job-list page; each
    ``li.job_item.clearfix`` element is one posting.  Appends one entry
    per posting to each of the parallel module lists (infnames, infadds,
    infsalaries, infcops, infwels, infquals, infexers) and a combined
    summary string to ``infs``.

    The original used bare ``except:`` clauses, which also swallow
    KeyboardInterrupt/SystemExit; they are narrowed here to the
    exceptions the guarded code can actually raise (AttributeError from
    calling ``.string.strip()`` / ``.group()`` on ``None``, TypeError
    from concatenating a ``None`` string).
    """
    global infs, infnames, infadds, infsalaries, infcops, infwels, infquals, infexers
    bsInfos = soup.find_all('li', class_='job_item clearfix')
    for bsinfo in bsInfos:
        address = bsinfo.find('span', class_="address")  # work address
        address = address.string.strip()
        name = bsinfo.find('span', class_="name")  # job title
        # keep CJK runs (allowing one joining character) from the tag markup
        name = re.findall(r'[\u4e00-\u9fa5]+.[\u4e00-\u9fa5]+', str(name))
        name = ''.join(name)
        salary = bsinfo.find('p', class_="job_salary")  # salary figure or phrase
        salary = re.search(r'(\d*-\d*)|([\u4e00-\u9fa5]+)', str(salary))
        salary = salary.group()
        try:
            unit = bsinfo.find('i', class_="unit")  # salary unit (e.g. yuan/month)
            unit = unit.string.strip()
        except AttributeError:  # tag missing or empty -> default unit
            unit = '元/月'
        try:
            com = bsinfo.find('div', class_="comp_name")  # company name
            company = re.search(r'[\u4e00-\u9fa5]+', str(com))
            comp = company.group()
        except AttributeError:  # tag missing or no CJK match -> "none"
            comp = '无'
        try:
            wel = bsinfo.find('div', class_="job_wel clearfix")  # welfare tags
            welfares = wel.contents
            wels = ''
            for welfare in welfares:
                wels = wels + welfare.string.strip()
                wels = wels + ' '
        except (AttributeError, TypeError):  # block absent or non-text child
            wels = '无'
        xueli = bsinfo.find('span', class_="xueli")  # education requirement
        xueli = re.search(r'[\u4e00-\u9fa5]+', str(xueli))
        xueli = xueli.group()
        exer = bsinfo.find('span', class_="jingyan")  # experience requirement
        exer = re.search(r'[\u4e00-\u9fa5]+|\d?-\d?年', str(exer))
        exer = exer.group()
        final = '职位名称:' + name + '职位' + ' ' + "公司:" + comp + ' ' + "薪水:" + salary + unit + ' ' + "地址:" + address + ' ' + '福利:' + wels + ' ' + '学历:' + xueli + ' ' + '经验:' + exer
        infs.append(final)
        infnames.append(name)
        infadds.append(address)
        infsalaries.append(salary + unit)
        infcops.append(comp)
        infwels.append(wels)
        infquals.append(xueli)
        infexers.append(exer)
def main():
    """Fetch 70 pages of the 58.com Ma'anshan job board, scrape each one
    with getInf(), then dump everything to '58.xls' via writeExcel().

    Page 0 uses the bare board URL; pages 2-70 use the '/pnN/' pager.
    Page 1 is skipped — presumably the site serves it at the bare URL
    already; TODO confirm against the live pager.

    Changes from the original: a request timeout so a dead page cannot
    hang the run forever; removed an unused ``ths = []`` local and a
    16-line no-op string literal holding a commented-out threading
    experiment (dead code).
    """
    for page in range(71):
        if page == 0:
            url = 'http://mas.58.com/job/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key='
        elif page == 1:
            continue
        else:
            # NOTE: '/job/' + '/pn' yields a double slash; kept byte-identical
            # because the live site accepted it in the original run.
            url = 'http://mas.58.com/job/' + '/pn' + str(page) + '/?PGTID=0d100000-007f-77d5-31ab-9a3915d1f878&ClickID=2#&key='
        response = requests.get(url, timeout=10)  # don't block forever on a dead page
        soup = BeautifulSoup(response.text, 'lxml')
        getInf(soup)
    writeExcel()
if __name__ == '__main__':
    # Only run the scrape when executed as a script, not when imported.
    main()
# (Stray non-code text from the page this file was copied from — "cannot be
# shown in full due to length, click here for more" — commented out because
# it was a Python syntax error.)