twc829 / blog_20160526_1_108643

From the CSDN blog "Single-threaded Crawler": http://blog.csdn.net/twc829/article/details/51510765#

Last updated 2016-06-06 22:13:08
blog_20160526_1_108643 (6 lines, Python)
#-*-coding:utf8-*-
import requests

html=requests.get('http://www.nowcoder.com/courses')

print html.text
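
For reference, a minimal sketch of the same request in Python 3 syntax, with a status-code check added (the check and timeout are my additions, not in the original post):

# Python 3 sketch: print the page body only if the request succeeded
import requests

resp = requests.get('http://www.nowcoder.com/courses', timeout=10)
if resp.status_code == 200:
    print(resp.text)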
blog_20160526_2_5015685 (15 lines, Python)
import re
import requests

# Acts like a mask: it makes the site being visited believe a browser is viewing the page
# To find your User-Agent value: right-click the page and choose Inspect, switch to the Network tab, refresh, click any request, and the relevant info is at the bottom of the Headers pane
# hea is a dict; the key is User-Agent and the value is the long string that follows
hea={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'}

# Pass hea in here
html=requests.get('http://jp.tingroom.com/yuedu/yd300p/',headers=hea)

# Convert the encoding to utf-8, otherwise Chinese text is displayed garbled
html.encoding='utf-8'

print html.text
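
When several requests share the same headers, a requests.Session saves repeating the hea dict on every call; a minimal sketch (the Session is my addition, not from the post):

import requests

# the Session attaches the User-Agent header to every request it makes
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'})
html = session.get('http://jp.tingroom.com/yuedu/yd300p/')
html.encoding = 'utf-8'
print html.text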
blog_20160526_3_5189626 (15 lines, Python)
import re
import requests

hea={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'}

html=requests.get('http://jp.tingroom.com/yuedu/yd300p/',headers=hea)

html.encoding='utf-8'

# print html.text

course=re.findall('<span style="color:#666666;">(.*?)</span></p></li>',html.text,re.S)

for i in course:
    print i
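
If the same pattern is applied to many pages, compiling it once with re.compile is the idiomatic form; a sketch reusing hea from above (the compiled pattern is my variation):

import re
import requests

# compile the pattern once, then reuse it for every page
pattern = re.compile('<span style="color:#666666;">(.*?)</span></p></li>', re.S)
html = requests.get('http://jp.tingroom.com/yuedu/yd300p/', headers=hea)
html.encoding = 'utf-8'
for i in pattern.findall(html.text):
    print i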
blog_20160606_4_9025231 (1 line, Python)
'page':'1'
blog_20160606_4_6247144 (12 lines, Python)
#-*-coding:utf8-*-
import requests
import re
url='https://www.crowdfunder.com/browse/deals&template=false'
data={ # dict with two entries, building the Form Data
    'entities_only':'true',
    'page':'1' # change this value to request other pages
}
html_post=requests.post(url,data=data)
title=re.findall('"card-title">(.*?)</div>',html_post.text,re.S)
for each in title:
    print each
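
Because 'page' is an ordinary form field, several pages can be fetched by rewriting it in a loop; a minimal sketch (the page range is arbitrary, my addition):

# fetch the first three result pages by changing the 'page' field each time
for page in range(1, 4):
    data['page'] = str(page)
    html_post = requests.post(url, data=data)
    for each in re.findall('"card-title">(.*?)</div>', html_post.text, re.S):
        print each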
blog_20160606_5_8057287 (29 lines, Python)
#-*-coding:utf8-*-
import requests
import re

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

url='http://www.jikexueyuan.com/course/?pageNum=1'

html=requests.get(url).text
course=re.findall('(<li id=".*?</li>)',html,re.S)
info={}

for each in course:
    info['title']=re.search('class="lessonimg" title="(.*?)" alt=',each,re.S).group(1)
    info['detail'] = re.search('display: none;">(.*?)</p>', each, re.S).group(1)
    temp=re.findall('<em>(.*?)</em>',each,re.S)
    info['date']=temp[0]
    info['class']=temp[1]
    info['number']=re.search('"learn-number">(.*?)</em>',each,re.S).group(1)

    f = open('info.txt', 'a')
    f.writelines('title: '+info['title']+'\n')
    f.writelines('detail: ' + info['detail'] + '\n')
    f.writelines('date: ' + info['date'] + '\n')
    f.writelines('class: ' + info['class'] + '\n')
    f.writelines('number: ' + info['number'] + '\n\n')
    f.close()
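
Opening and closing info.txt on every course works, but opening it once with a with block before the loop is more idiomatic and closes the file even if an exception is raised; a sketch of just the writing step (same fields as above):

# open once, append every course, close automatically when the block exits
with open('info.txt', 'a') as f:
    for each in course:
        title = re.search('class="lessonimg" title="(.*?)" alt=', each, re.S).group(1)
        f.writelines('title: ' + title + '\n')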
blog_20160606_6_6279380 (39 lines, Python)
#-*-coding:utf8-*-
import requests
import re

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

url='http://www.jikexueyuan.com/course/?pageNum=1'

cur_url=int(re.search('pageNum=(\d+)',url,re.S).group(1))
page_group=[]
for i in range(1,21):
    links=re.sub('pageNum=\d+','pageNum=%s'%i,url)
    page_group.append(links)



for link in page_group:
    print u'Processing page: '+link
    html=requests.get(link).text
    course=re.findall('(<li id=".*?</li>)',html,re.S)
    info={}

    for each in course:
        info['title']=re.search('class="lessonimg" title="(.*?)" alt=',each,re.S).group(1)
        info['detail'] = re.search('display: none;">(.*?)</p>', each, re.S).group(1)
        temp=re.findall('<em>(.*?)</em>',each,re.S)
        info['date']=temp[0]
        info['class']=temp[1]
        info['number']=re.search('"learn-number">(.*?)</em>',each,re.S).group(1)

        f = open('info.txt', 'a')
        f.writelines('title: '+info['title']+'\n')
        f.writelines('detail: ' + info['detail'] + '\n')
        f.writelines('date: ' + info['date'] + '\n')
        f.writelines('class: ' + info['class'] + '\n')
        f.writelines('number: ' + info['number'] + '\n\n')
        f.close()
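
Note that cur_url is computed but never used; if the intent was to start from the page named in the URL rather than always page 1, the loop would read (a sketch of that assumption):

total_page = 20
for i in range(cur_url, total_page + 1):
    links = re.sub('pageNum=\d+', 'pageNum=%s' % i, url)
    page_group.append(links)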
blog_20160606_7_5581269 (71 lines, Python)
#-*-coding:utf8-*-
import requests
import re

# The default encoding on Windows is GBK while web pages default to utf-8; the mismatch easily garbles crawled content
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class spider(object):
    def __init__(self):
        print u'Starting to crawl...'

    # getsource fetches a page's source code
    def getsource(self,url):
        html = requests.get(url)
        return html.text

    # changepage generates the links for the different page numbers
    def changepage(self,url,total_page):
        now_page = int(re.search('pageNum=(\d+)',url,re.S).group(1))
        page_group = []
        for i in range(now_page,total_page+1): # range() is left-closed, right-open, hence the +1
            link = re.sub('pageNum=\d+','pageNum=%s'%i,url)
            page_group.append(link)
        return page_group

    # geteveryclass extracts each course block from the page source
    def geteveryclass(self,source):
        everyclass = re.findall('(<li id=.*?</li>)',source,re.S)
        return everyclass

    # getinfo pulls the fields we need out of one course block
    def getinfo(self,eachclass):
        info = {} # dict
        info['title'] = re.search('lessonimg" title="(.*?)" alt',eachclass,re.S).group(1)
        info['content'] = re.search('display: none;">(.*?)</p>',eachclass,re.S).group(1)
        timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S)
        info['classtime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search('"learn-number">(.*?)</em>',eachclass,re.S).group(1)
        return info

    # saveinfo writes the results into info.txt
    def saveinfo(self,classinfo):
        f = open('info.txt','a') # opened in append mode
        for each in classinfo:
            f.writelines('title:' + each['title'] + '\n')
            f.writelines('content:' + each['content'] + '\n')
            f.writelines('classtime:' + each['classtime'] + '\n')
            f.writelines('classlevel:' + each['classlevel'] + '\n')
            f.writelines('learnnum:' + each['learnnum'] +'\n\n')
        f.close()

##### Program entry point
# __name__ equals '__main__' when jikexueyuan.py is run directly
if __name__ == '__main__':

    classinfo = [] # empty list that collects the course info
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider() # instantiate the spider class
    all_links = jikespider.changepage(url,20)
    for link in all_links:
        print u'Processing page: ' + link
        html = jikespider.getsource(link)
        everyclass = jikespider.geteveryclass(html)
        for each in everyclass:
            info = jikespider.getinfo(each)
            classinfo.append(info)
    jikespider.saveinfo(classinfo)
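
One fragile spot: a single failed request anywhere in the 20 pages raises an exception and kills the whole run. A minimal hardening sketch for getsource (the timeout value and empty-string fallback are my additions, not from the post):

    # drop-in replacement for getsource: returns '' instead of raising on a network error
    def getsource(self,url):
        try:
            html = requests.get(url, timeout=10)
            html.encoding = 'utf-8'
            return html.text
        except requests.RequestException:
            return ''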
blog_20160606_8_979463 (3 lines, Python)
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
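
This reload/setdefaultencoding trick only exists in Python 2; reload is no longer a builtin in Python 3, where strings are unicode by default. The Python 3 equivalent of what these snippets need is simply an explicit encoding when opening the output file; a sketch reusing the info dict from the snippets above:

# Python 3: declare the file encoding instead of changing the interpreter default
with open('info.txt', 'a', encoding='utf-8') as f:
    f.write('title: ' + info['title'] + '\n')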