# 爬取关键字写入表格 — Scrape keywords and write them to a spreadsheet.
import requests
import re
import xlwt

# HTTP headers sent with each request; only a Firefox-on-Linux User-Agent
# string is set.
headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
}

# Workbook with a single sheet that collects the scraped rows.
# cell_overwrite_ok=True permits writing the same cell more than once.
book = xlwt.Workbook()
sheet = book.add_sheet('ke_tengxun', cell_overwrite_ok=True)

# Index of the next sheet row to fill; starts at 1, below row 0.
rownum = 1
# Regular expressions applied to the raw page HTML by get_info().
# NOTE(review): in the pasted source the course-name and link patterns were
# split across two lines right after the opening quote (a syntax error), so
# whatever preceded ``.*?`` there -- possibly an HTML tag -- is lost. Confirm
# both patterns against the live page markup before relying on the output.
_PUBLISHER_RE = re.compile('.*?rel="nofollow".*?>(.*?)<', re.S)
_COURSE_NAME_RE = re.compile('.*?cors-name="course">(.*?)<', re.S)
_COURSE_LINK_RE = re.compile('.*?href="(.*?)".*?target', re.S)


def get_info(url, max_rows=24):
    """Scrape one course-list page and append its rows to the shared sheet.

    Downloads ``url`` using the module-level ``headers``, extracts three
    parallel lists from the HTML (publisher, course name, course link),
    prints each list with its length, writes the header row plus one row per
    course into the module-level ``sheet`` starting at the global ``rownum``,
    and finally saves the workbook to ``test.xls``.

    Args:
        url: Address of the course-list page to fetch.
        max_rows: Upper bound on the rows written per page (default 24,
            presumably the number of courses shown per page -- TODO confirm).
            ``None`` removes the bound.
    """
    global rownum

    # A timeout keeps a stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)

    ke_chupin = _PUBLISHER_RE.findall(res.text)
    ke_name = _COURSE_NAME_RE.findall(res.text)
    ke_href = _COURSE_LINK_RE.findall(res.text)

    # Debug output: each extracted list followed by its length.
    print(ke_chupin)
    print(len(ke_chupin))
    print(ke_name)
    print(len(ke_name))
    print(ke_href)
    print(len(ke_href))

    # Header row: course, link, publisher. Rewritten on every call with the
    # same values, so repeating it is harmless.
    head = ['课程', '链接', '出品人']
    for col, title in enumerate(head):
        sheet.write(0, col, title)

    # The three lists come from independent regex scans, so their lengths can
    # differ. Indexing all of them by the name list's length raised
    # IndexError; zip() stops at the shortest list instead.
    if not len(ke_name) == len(ke_href) == len(ke_chupin):
        print('warning: extracted lists differ in length; rows may be '
              'truncated or misaligned for', url)

    for name, href, publisher in zip(ke_name[:max_rows], ke_href, ke_chupin):
        sheet.write(rownum, 0, name)
        sheet.write(rownum, 1, href)
        sheet.write(rownum, 2, publisher)
        rownum += 1

    # Save after every page so rows collected so far are on disk even if a
    # later page fails.
    book.save('test.xls')

if __name__ == '__main__':
    # Scrape listing pages 1 through 12, one get_info() call per page.
    for page in range(1, 13):
        url = 'https://ke.qq.com/course/list?mt=1001&st=2002&tt=3019&page={}'.format(page)
        get_info(url)