No separate notes were kept for the crawler itself; for the anti-scraping problems encountered while crawling, see the blog post《Boss直聘反爬解决方法》.

Creating the App

This project uses the Django framework, so the first step is to create a new app.

Run python manage.py startapp myApp to create a new Django app.

The generated directory structure typically looks like this:

myApp/
    __init__.py   # An empty file that tells Python this directory is a Python package.
    admin.py      # Registers models with the Django admin site.
    apps.py       # Contains the app's configuration.
    migrations/   # Holds database migration files.
        __init__.py
    models.py     # Defines the database models.
    tests.py      # Holds the app's test cases.
    views.py      # Holds view functions or classes.

Defining the Database Models

  1. In models.py, define three Django models: JobInfo, User, and History. Each model corresponds to a database table, and each model field corresponds to a column in that table. Defining the models effectively defines the table structure; Django generates the corresponding database tables from them automatically.

This has the following benefits:

  • Data persistence: the models store and manage data in the database. For example, JobInfo stores and queries job postings, while User manages user accounts.
  • Data relationships: foreign keys (such as job and user on the History model) establish relationships between models, enabling more complex queries and operations (see the ORM query sketch after the model code below).
  • Database migrations: once the models are defined, Django's migration tools (python manage.py makemigrations and python manage.py migrate) create or update the table structure.

from django.db import models


# Create your models here.
class JobInfo(models.Model):
    id = models.AutoField('id', primary_key=True)
    title = models.CharField("岗位名字", max_length=255, default='')
    address = models.CharField('省会', max_length=255, default='')
    type = models.CharField('职业', max_length=255, default='')
    educational = models.CharField('学历', max_length=255, default='')
    workExperience = models.CharField('工作经验', max_length=255, default='')
    workTag = models.CharField('工作标签', max_length=255, default='')
    salary = models.CharField('薪资', max_length=255, default='')
    salaryMonth = models.CharField('年终奖', max_length=255, default='')
    companyTags = models.CharField('公司福利', max_length=255, default='')
    hrWork = models.CharField('人事职位', max_length=255, default='')
    hrName = models.CharField('人事名字', max_length=255, default='')
    pratice = models.BooleanField('是否为实习单位', default=False)
    companyTitle = models.CharField('公司名称', max_length=255, default='')
    companyAvatar = models.CharField('公司头像', max_length=255, default='')
    companyNature = models.CharField('公司性质', max_length=255, default='')
    companyStatus = models.CharField('公司状态', max_length=255, default='')
    companyPeople = models.CharField('公司人数', max_length=255, default='')
    detailUrl = models.CharField('详情地址', max_length=255, default='')
    companyUrl = models.CharField('公司详情地址', max_length=255, default='')
    createTime = models.DateTimeField('创建时间', auto_now_add=True)
    dist = models.CharField('行政区', max_length=25, default='')

    class Meta:
        db_table = "jobInfo"


class User(models.Model):
    id = models.AutoField('id', primary_key=True)
    username = models.CharField('用户名', max_length=255, default='')
    password = models.CharField('密码', max_length=25, default='')
    educational = models.CharField('学历', max_length=255, default='')
    workExpirence = models.CharField('工作经验', max_length=255, default='')
    address = models.CharField('意向城市', max_length=255, default='')
    work = models.CharField('意向岗位', max_length=25, default='')
    avatar = models.FileField('用户头像', upload_to="avatar", default="avatar/default.png")
    createTime = models.DateTimeField('创建时间', auto_now_add=True)

    class Meta:
        db_table = "user"


class History(models.Model):
    id = models.AutoField('id', primary_key=True)
    job = models.ForeignKey(JobInfo, on_delete=models.CASCADE)
    user = models.ForeignKey(User, on_delete=models.CASCADE)
    count = models.IntegerField("点击次数", default=1)

    class Meta:
        db_table = "histroy"
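
As a quick illustration of what the job and user foreign keys on History make possible, here is a minimal ORM sketch (hypothetical helper functions, not part of the project code) that records a click and lists a user's most-clicked jobs:

from django.db.models import F

from myApp.models import History


def record_click(user_id, job_id):
    # Create the History row on the first click, otherwise bump the counter.
    history, created = History.objects.get_or_create(
        user_id=user_id, job_id=job_id, defaults={"count": 1}
    )
    if not created:
        History.objects.filter(pk=history.pk).update(count=F("count") + 1)


def top_clicked_jobs(user_id, limit=10):
    # Follow the job foreign key and return the user's most-clicked postings.
    return (History.objects.filter(user_id=user_id)
            .select_related("job")
            .order_by("-count")[:limit])
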
  2. Add the app myApp to INSTALLED_APPS in graduationdesign's settings.py, then run the database migrations to generate the corresponding tables in the MySQL database.
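
For reference, the relevant part of settings.py would look roughly like this (the other entries are Django's defaults; only the 'myApp' line is the addition):

# graduationdesign/settings.py
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'myApp',  # register the app so Django discovers its models
]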

Run the following commands:

python manage.py makemigrations
python manage.py migrate
  3. Modify the crawler code to add a save-to-database step, then write the scraped data into the database.

Boss直聘 crawler source code (with database persistence)

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import csv
import os
import json
import pandas as pd
import django

# Point this standalone script at the project's settings so the Django ORM can be used outside the web server.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'graduationdesign.settings')
django.setup()
from myApp.models import JobInfo


class spider(object):
    def __init__(self, type, page):
        self.type = type  # job keyword
        self.page = int(page)  # current page number
        self.spiderUrl = "https://www.zhipin.com/web/geek/job?query=%s&city=100010000&page=%s"
        self.brower = self.startBrower()  # initialize the browser instance only once

    def startBrower(self):
        service = Service("./chromedriver.exe")
        options = webdriver.ChromeOptions()

        # Reuse an already-open browser session (helps avoid anti-scraping checks).
        # Open cmd and run: chrome.exe -remote-debugging-port=9222
        options.add_experimental_option('debuggerAddress', 'localhost:9222')

        # Option 2: use a phone hotspot; reconnecting the hotspot gives a new IP each time.
        # options.add_experimental_option("excludeSwitches", ['enable-automation'])
        brower = webdriver.Chrome(service=service, options=options)
        return brower

    def main(self, page):  # page is the total number of pages to crawl
        if self.page > page: return
        # brower = self.startBrower()
        print("正在爬取的页面路径:" + self.spiderUrl % (self.type, self.page))
        self.brower.get(self.spiderUrl % (self.type, self.page))
        time.sleep(15)
        job_list = self.brower.find_elements(by=By.XPATH, value='//ul[@class="job-list-box"]/li')  # the value argument takes an XPath expression
        for index, job in enumerate(job_list):
            try:
                jobData = []
                print("正在爬取第%d个数据" % (index + 1))
                # title
                title = job.find_element(by=By.XPATH,
                                          value=".//a[@class='job-card-left']/div[contains(@class,'job-title')]"
                                                "/span[@class='job-name']").text
                # address
                addresses = job.find_element(by=By.XPATH,
                                              value=".//a[@class='job-card-left']/div[contains(@class,'job-title')]"
                                                    "/span[@class='job-area-wrapper']/span").text.split('·')
                address = addresses[0]
                if len(addresses) != 1:
                    dist = addresses[1]
                else:
                    dist = ''

                # type
                type = self.type

                # educational, workExperience
                tag_list = job.find_elements(by=By.XPATH,
                                             value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]"
                                                   "/ul[@class='tag-list']/li")
                if len(tag_list) == 2:
                    educational = tag_list[1].text
                    workExperience = tag_list[0].text
                else:
                    educational = tag_list[2].text
                    workExperience = tag_list[1].text

                # hrName
                hrName = job.find_element(by=By.XPATH,
                                          value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]"
                                                "/div[@class='info-public']").text
                # hrWork
                hrWork = job.find_element(by=By.XPATH,
                                          value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]"
                                                "/div[@class='info-public']/em").text
                # workTag
                workTag = job.find_elements(by=By.XPATH,
                                            value="./div[contains(@class,'job-card-footer')]/ul[@class='tag-list']/li")
                workTag = json.dumps(list(map(lambda x: x.text, workTag)))

                # pratice: set to 1 below if the posting is a day-rate internship
                pratice = 0

                salaries = job.find_element(by=By.XPATH,
                                            value=".//a[@class='job-card-left']/div[contains(@class,'job-info')]"
                                                  "/span[@class='salary']").text

                if salaries.find('K') != -1:
                    salaries = salaries.split('·')
                    if len(salaries) == 1:
                        # salary
                        salary = list(map(lambda x: int(x) * 1000, salaries[0].replace('K', '').split('-')))
                        # salaryMonth
                        salaryMonth = '0薪'
                    else:
                        # salary
                        salary = list(map(lambda x: int(x) * 1000, salaries[0].replace('K', '').split('-')))
                        # salaryMonth
                        salaryMonth = salaries[1]
                else:
                    # salary
                    salary = list(map(lambda x: int(x), salaries.replace('元/天', '').split('-')))
                    # salaryMonth
                    salaryMonth = '0薪'
                    pratice = 1

                # companyTitle
                companyTitle = job.find_element(by=By.XPATH,
                                                value=".//div[@class='job-card-right']/div[@class='company-info']/h3/a").text
                # companyAvatar
                companyAvatar = job.find_element(by=By.XPATH,
                                                 value=".//div[@class='job-card-right']/div[@class='company-logo']/a/img").get_attribute(
                    "src")
                companyInfos = job.find_elements(by=By.XPATH,
                                                 value=".//div[@class='job-card-right']/div[@class='company-info']/ul[@class='company-tag-list']/li")
                if len(companyInfos) == 3:
                    # companyNature
                    companyNature = companyInfos[0].text
                    # companyStatus
                    companyStatus = companyInfos[1].text
                    # companyPeople
                    companyPeoples = companyInfos[2].text
                    if companyPeoples != "10000人以上":
                        companyPeople = list(map(lambda x: x, companyInfos[2].text.replace('人', '').split('-')))
                    else:
                        companyPeople = [0, 10000]
                else:
                    # companyNature
                    companyNature = companyInfos[0].text
                    # companyStatus
                    companyStatus = '未融资'
                    # companyPeople
                    companyPeoples = companyInfos[1].text
                    if companyPeoples != "10000人以上":
                        companyPeople = list(map(lambda x: x, companyInfos[1].text.replace('人', '').split('-')))
                    else:
                        companyPeople = [0, 10000]

                # companyTags
                companyTags = job.find_element(by=By.XPATH,
                                               value="./div[contains(@class,'job-card-footer')]/div[@class='info-desc']").text
                if not companyTags:
                    companyTags = '无'
                else:
                    companyTags = json.dumps(companyTags.split(','))

                # detailUrl
                detailUrl = job.find_element(by=By.XPATH,
                                             value=".//a[@class='job-card-left']").get_attribute('href')
                # companyUrl
                companyUrl = job.find_element(by=By.XPATH,
                                              value=".//div[@class='job-card-right']/div[@class='company-info']/h3/a").get_attribute(
                    'href')

                jobData.append(title)
                jobData.append(address)
                jobData.append(type)
                jobData.append(educational)
                jobData.append(workExperience)
                jobData.append(workTag)
                jobData.append(salary)
                jobData.append(salaryMonth)
                jobData.append(companyTags)
                jobData.append(hrWork)
                jobData.append(hrName)
                jobData.append(pratice)
                jobData.append(companyTitle)
                jobData.append(companyAvatar)
                jobData.append(companyNature)
                jobData.append(companyStatus)
                jobData.append(companyPeople)
                jobData.append(detailUrl)
                jobData.append(companyUrl)
                jobData.append(dist)

                self.save_to_csv(jobData)
            except Exception:
                # Skip the card if an element is missing or the page layout changes.
                pass

        self.page += 1
        self.main(page)

    def save_to_csv(self, rowData):
        # with open('./temp1.csv', 'a', newline='', encoding='utf8') as wf:
        with open('./temp2.csv', 'a', newline='', encoding='utf-8-sig') as wf:
            writer = csv.writer(wf)
            writer.writerow(rowData)

    def clear_csv(self):
        # Drop empty and duplicate rows before loading the data into the database.
        df = pd.read_csv('./temp2.csv')
        df.dropna(inplace=True)
        df.drop_duplicates(inplace=True)
        df['salaryMonth'] = df['salaryMonth'].map(lambda x: x.replace('薪', ''))
        print("总数为%d" % df.shape[0])
        return df.values

    def save_to_sql(self):
        data = self.clear_csv()
        for job in data:
            JobInfo.objects.create(
                title=job[0],
                address=job[1],
                type=job[2],
                educational=job[3],
                workExperience=job[4],
                workTag=job[5],
                salary=job[6],
                salaryMonth=job[7],
                companyTags=job[8],
                hrWork=job[9],
                hrName=job[10],
                pratice=job[11],
                companyTitle=job[12],
                companyAvatar=job[13],
                companyNature=job[14],
                companyStatus=job[15],
                companyPeople=job[16],
                detailUrl=job[17],
                companyUrl=job[18],
                dist=job[19]
            )

    def init(self):
        # Create the CSV with a header row on the first run.
        if not os.path.exists("./temp2.csv"):
            with open('./temp2.csv', 'a', newline='', encoding="utf8") as wf:
                writer = csv.writer(wf)
                writer.writerow(
                    ["title", "address", "type", "educational", "workExperience", "workTag", "salary", "salaryMonth",
                     "companyTags", "hrWork", "hrName", "pratice", "companyTitle", "companyAvatar",
                     "companyNature", "companyStatus", "companyPeople", "detailUrl", "companyUrl", "dist"])


if __name__ == "__main__":
    # JobInfo.objects.all()
    # keywords = ["python爬虫", "python后端", "python大数据", "python数据分析", "AI", "python"]
    # keywords = ["运维", "测试", "机器学习", "算法工程师"]

    # spiderObj = spider("机器学习", "1")
    spiderObj = spider("算法工程师", "1")
    # spiderObj.init()
    # spiderObj.main(10)
    spiderObj.save_to_sql()
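
A typical run might look like the sketch below (the keyword list and page count are placeholders, not the project's actual configuration): initialize the CSV once, crawl a few pages per keyword, and finally bulk-load the cleaned rows into MySQL through the ORM.

# Hypothetical driver: crawl several keywords, then load everything into the database.
keywords = ["python爬虫", "机器学习", "算法工程师"]  # placeholder keyword list

spiderObj = None
for kw in keywords:
    spiderObj = spider(kw, "1")   # start from page 1 for each keyword
    spiderObj.init()              # create temp2.csv with a header row on first run
    spiderObj.main(10)            # crawl up to 10 result pages for this keyword

# Final pass: dedupe/clean the CSV and insert every row via the Django ORM.
if spiderObj is not None:
    spiderObj.save_to_sql()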