1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
| import requests import re import xlwt
# HTTP headers sent with every request; a desktop Firefox User-Agent is used
# instead of the default python-requests one.
headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
}

# One workbook with a single worksheet collects the rows from every page.
# cell_overwrite_ok=True allows the same cell to be written more than once.
book = xlwt.Workbook()
sheet = book.add_sheet('ke_tengxun', cell_overwrite_ok=True)
# Index of the next free data row in the worksheet; row 0 holds the header.
rownum = 1


def get_info(url, timeout=10):
    """Scrape one course-listing page and append its courses to the worksheet.

    Downloads ``url`` using the module-level ``headers``, extracts three
    parallel lists from the response text with regular expressions (producer,
    course name, course link), prints each list and its length, writes the
    rows to the module-level ``sheet`` starting at the global ``rownum``, and
    saves the workbook to ``test.xls``.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Side effects:
        Advances the global ``rownum`` by the number of rows written and
        rewrites ``test.xls`` on every call.
    """
    global rownum

    res = requests.get(url, headers=headers, timeout=timeout)
    html = res.text

    # NOTE(review): the patterns are kept exactly as found; they look like
    # they may have lost HTML tags when the script was copied - confirm
    # against the live page markup.
    pattern = re.compile('.*?rel="nofollow".*?>(.*?)<', re.S)
    ke_chupin = re.findall(pattern, html)
    pattern = re.compile(' .*?cors-name="course">(.*?)<', re.S)
    ke_name = re.findall(pattern, html)
    pattern = re.compile(' .*?href="(.*?)".*?target', re.S)
    ke_href = re.findall(pattern, html)

    # Diagnostic output: differing lengths mean the patterns drifted apart.
    for found in (ke_chupin, ke_name, ke_href):
        print(found)
        print(len(found))

    # Header labels: course, link, producer. Rewritten on every call, which
    # is harmless because the sheet permits overwriting cells.
    head = ['课程', '链接', '出品人']
    for col, title in enumerate(head):
        sheet.write(0, col, title)

    # At most the first 24 names are used. zip() stops at the shortest list,
    # so a page where the three patterns matched a different number of times
    # no longer raises IndexError halfway through a row.
    for name, href, chupin in zip(ke_name[:24], ke_href, ke_chupin):
        sheet.write(rownum, 0, name)
        sheet.write(rownum, 1, href)
        sheet.write(rownum, 2, chupin)
        rownum += 1

    # Save after each page so earlier results survive a later failure.
    book.save('test.xls')
if __name__ == '__main__':
    # Scrape listing pages 1 through 12, one request per page, in order.
    page_template = 'https://ke.qq.com/course/list?mt=1001&st=2002&tt=3019&page={}'
    for page_url in map(page_template.format, range(1, 13)):
        get_info(page_url)
|