Web Crawlers

Definition

A web crawler is a program or script that automatically fetches information from the web according to a defined set of rules.


Simple Implementation

The examples below strip the request headers that are not needed and scrape the information held in specific tags.

module01.py

#coding=utf-8
import requests

# Browser-style request headers captured from the developer tools;
# the Cookie and Host values are specific to the target site.
head_me = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid=79f3d21d1f3928bb6d698e1aa56b6fa9; UM_distinctid=168ac507d95899-0bdc39e526236b-b781636-1fa400-168ac507d96331; pgv_pvi=2586685440; chkphone=acWxNpxhQpDiAchhNuSnEqyiQuDIO0O0O; browse=CFlZTxUYU0BRV1lAVQJTRFBZSkdeQFBYWVBFRF1RWUxTUVxPXEhLThQ; Hm_lvt_d7682ab43891c68a00de46e9ce5b76aa=1552961840; Hm_lpvt_d7682ab43891c68a00de46e9ce5b76aa=1552961840; ci_session=8a4f47241915fa49ff5ad3b168bdb568f4f3a8b9; CNZZDATA1262179648=469647292-1549077335-%7C1552958777; pgv_si=s1260497920; Hm_lvt_2d0601bd28de7d49818249cf35d95943=1552353494,1552477099,1552705976,1552961909; Hm_lpvt_2d0601bd28de7d49818249cf35d95943=1552961948",
    "Host": "www.xxllxx.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url_me = "http://www.xxllxx.com"
# Send the browser-style headers with the request and print the page source
html_test = requests.get(url=url_me, headers=head_me)
html_test.encoding = 'utf-8'
print(html_test.text)
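
module01.py only prints the raw page source. To actually pull out information from specific tags, the HTML still has to be parsed. A minimal sketch (assuming the bs4 package used by spider03.py below is installed, and that the page has a <title> and some <a> tags):

# Sketch: parse the HTML fetched above and extract individual tags with BeautifulSoup
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_test.text, "html.parser")
print(soup.title.string)           # text of the page <title>, if present
for link in soup.find_all('a'):    # every link on the page
    print(link.get('href'))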


spider02.py

import requests
import json

def pachong(i):
    # A single User-Agent header is enough for this AJAX endpoint
    head_me = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"
    }

    url_me = "https://www.xxllxx.com/courses/ajaxCourses"
    # Form fields expected by the endpoint; only pageIndex changes between calls
    post_data = {
        "courseTag": "",
        "courseDiffcuty": "",
        "IsExp": "",
        "producerId": "",
        "orderField": "",
        "pageIndex": i,
        "tagType": "",
        "isOpen": ""
    }

    # POST the form and parse the JSON response
    html_text = requests.post(url=url_me, headers=head_me, data=post_data)
    new_text = json.loads(html_text.text)

    # print(new_text)
    for name in new_text['course']['result']:
        print(name['courseName'])

for i in range(1, 19):
    pachong(i)
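
Hard-coding range(1, 19) means the loop has to be edited whenever the number of course pages changes. An alternative is to keep requesting pages until one comes back empty; this is only a sketch, and it assumes the endpoint answers an out-of-range pageIndex with an empty result list:

# Sketch: page through the course list until a request returns no results.
# Assumes (not verified) that past the last page the 'result' list is empty.
import requests
import json

def fetch_page(i):
    head_me = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"}
    post_data = {"courseTag": "", "courseDiffcuty": "", "IsExp": "", "producerId": "",
                 "orderField": "", "pageIndex": i, "tagType": "", "isOpen": ""}
    resp = requests.post(url="https://www.xxllxx.com/courses/ajaxCourses",
                         headers=head_me, data=post_data)
    return json.loads(resp.text)['course']['result']

i = 1
while True:
    courses = fetch_page(i)
    if not courses:                  # empty page: we are past the end
        break
    for course in courses:
        print(course['courseName'])
    i += 1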


spider03.py

import requests
from bs4 import BeautifulSoup

def jianshu(i):
    url_me = "http://www.jianshu.com/?page=" + str(i)
    # Headers copied from the browser's XHR request to Jianshu's infinite-scroll feed
    head_me = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0",
        "Accept": "text/html, */*; q=0.01",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "https://www.jianshu.com/",
        "X-CSRF-Token": "/Rn0x3ke4Dk3ZO+MJF70kGHb+8SGKQID1nFsoRj5Tq4N/Le5UUAGnpvjaGQMlQk9vtCNwQXfEKqk10Prm/PK0A==",
        "X-INFINITESCROLL": "true",
        "X-Requested-With": "XMLHttpRequest",
    }

    html_text = requests.get(url=url_me, headers=head_me)
    # print(html_text.text)
    # Re-decode the response as UTF-8 before handing it to the parser
    html = BeautifulSoup(html_text.text.encode(html_text.encoding).decode('utf-8'), "html.parser")
    a_all = html.find_all('a', 'title')        # article title links
    p_all = html.find_all('p', 'abstract')     # article summaries

    # Append each article's title, abstract and link to a text file
    with open(r"./jianshu.txt", "a", encoding='utf-8') as file:
        for (a, p) in zip(a_all, p_all):
            file.write(a.string + '\n')
            file.write(p.string + '\n')
            file.write("http://www.jianshu.com" + a.get('href') + '\n\n\n')

# crawl the first two pages of the feed
for i in range(1, 3):
    jianshu(i)
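
When a crawl covers more than a couple of pages, it is worth pausing between requests so the target site is not hit in a tight loop. A minimal sketch (the one-second delay is just an example value):

# Sketch: wait briefly between page requests to keep the load on the server low
import time

for i in range(1, 3):
    jianshu(i)
    time.sleep(1)    # one-second pause between pages (example value)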

Your support keeps me going!