Web Crawlers

Definition

A web crawler is a program or script that automatically fetches information from the web according to a defined set of rules.


Simple Implementation

The examples below strip the request headers that are not needed and scrape the information held in specific tags.

module01.py

#coding=utf-8
import requests

# Browser-style request headers captured from the developer tools;
# the Cookie and Host values are specific to the target site.
head_me = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "__jsluid=79f3d21d1f3928bb6d698e1aa56b6fa9; UM_distinctid=168ac507d95899-0bdc39e526236b-b781636-1fa400-168ac507d96331; pgv_pvi=2586685440; chkphone=acWxNpxhQpDiAchhNuSnEqyiQuDIO0O0O; browse=CFlZTxUYU0BRV1lAVQJTRFBZSkdeQFBYWVBFRF1RWUxTUVxPXEhLThQ; Hm_lvt_d7682ab43891c68a00de46e9ce5b76aa=1552961840; Hm_lpvt_d7682ab43891c68a00de46e9ce5b76aa=1552961840; ci_session=8a4f47241915fa49ff5ad3b168bdb568f4f3a8b9; CNZZDATA1262179648=469647292-1549077335-%7C1552958777; pgv_si=s1260497920; Hm_lvt_2d0601bd28de7d49818249cf35d95943=1552353494,1552477099,1552705976,1552961909; Hm_lpvt_2d0601bd28de7d49818249cf35d95943=1552961948",
    "Host": "www.xxllxx.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url_me = "http://www.xxllxx.com"
# Send the browser-style headers with the request and print the page source
html_test = requests.get(url=url_me, headers=head_me)
html_test.encoding = 'utf-8'
print(html_test.text)
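
module01.py only prints the raw page source. To actually pull out information from specific tags, the HTML still has to be parsed. A minimal sketch (assuming the bs4 package used by spider03.py below is installed, and that the page has a <title> and some <a> tags):

# Sketch: parse the HTML fetched above and extract individual tags with BeautifulSoup
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_test.text, "html.parser")
print(soup.title.string)           # text of the page <title>, if present
for link in soup.find_all('a'):    # every link on the page
    print(link.get('href'))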


spider02.py

import requests
import json

def pachong(i):
    # A single User-Agent header is enough for this AJAX endpoint
    head_me = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"
    }

    url_me = "https://www.xxllxx.com/courses/ajaxCourses"
    # Form fields expected by the endpoint; only pageIndex changes between calls
    post_data = {
        "courseTag": "",
        "courseDiffcuty": "",
        "IsExp": "",
        "producerId": "",
        "orderField": "",
        "pageIndex": i,
        "tagType": "",
        "isOpen": ""
    }

    # POST the form and parse the JSON response
    html_text = requests.post(url=url_me, headers=head_me, data=post_data)
    new_text = json.loads(html_text.text)

    # print(new_text)
    for name in new_text['course']['result']:
        print(name['courseName'])

for i in range(1, 19):
    pachong(i)
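
Hard-coding range(1, 19) means the loop has to be edited whenever the number of course pages changes. An alternative is to keep requesting pages until one comes back empty; this is only a sketch, and it assumes the endpoint answers an out-of-range pageIndex with an empty result list:

# Sketch: page through the course list until a request returns no results.
# Assumes (not verified) that past the last page the 'result' list is empty.
import requests
import json

def fetch_page(i):
    head_me = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"}
    post_data = {"courseTag": "", "courseDiffcuty": "", "IsExp": "", "producerId": "",
                 "orderField": "", "pageIndex": i, "tagType": "", "isOpen": ""}
    resp = requests.post(url="https://www.xxllxx.com/courses/ajaxCourses",
                         headers=head_me, data=post_data)
    return json.loads(resp.text)['course']['result']

i = 1
while True:
    courses = fetch_page(i)
    if not courses:                  # empty page: we are past the end
        break
    for course in courses:
        print(course['courseName'])
    i += 1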


spider03.py

import requests
from bs4 import BeautifulSoup

def jianshu(i):
    url_me = "http://www.jianshu.com/?page=" + str(i)
    # Headers copied from the browser's XHR request to Jianshu's infinite-scroll feed
    head_me = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0",
        "Accept": "text/html, */*; q=0.01",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "https://www.jianshu.com/",
        "X-CSRF-Token": "/Rn0x3ke4Dk3ZO+MJF70kGHb+8SGKQID1nFsoRj5Tq4N/Le5UUAGnpvjaGQMlQk9vtCNwQXfEKqk10Prm/PK0A==",
        "X-INFINITESCROLL": "true",
        "X-Requested-With": "XMLHttpRequest",
    }

    html_text = requests.get(url=url_me, headers=head_me)
    # print(html_text.text)
    # Re-decode the response as UTF-8 before handing it to the parser
    html = BeautifulSoup(html_text.text.encode(html_text.encoding).decode('utf-8'), "html.parser")
    a_all = html.find_all('a', 'title')        # article title links
    p_all = html.find_all('p', 'abstract')     # article summaries

    # Append each article's title, abstract and link to a text file
    with open(r"./jianshu.txt", "a", encoding='utf-8') as file:
        for (a, p) in zip(a_all, p_all):
            file.write(a.string + '\n')
            file.write(p.string + '\n')
            file.write("http://www.jianshu.com" + a.get('href') + '\n\n\n')

# crawl the first two pages of the feed
for i in range(1, 3):
    jianshu(i)
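
When a crawl covers more than a couple of pages, it is worth pausing between requests so the target site is not hit in a tight loop. A minimal sketch (the one-second delay is just an example value):

# Sketch: wait briefly between page requests to keep the load on the server low
import time

for i in range(1, 3):
    jianshu(i)
    time.sleep(1)    # one-second pause between pages (example value)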

Your support keeps me going!