import json
import random
import time

import pandas as pd
import requests
from lxml import etree
from pandas.core.frame import DataFrame
from pyquery import PyQuery as pq
# Request headers passed to requests.get(); the User-Agent string is that of
# desktop Chrome 81 on 64-bit Windows 10.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.129 Safari/537.36'
    ),
}
# Seconds to wait for a single HTTP response; without a timeout a stalled
# connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10

# Site root that the relative hrefs scraped from the pages are appended to.
BASE_URL = 'https://music.163.com'

# Hot-playlist index page; ``{page}`` fills the ``offset`` query parameter.
PLAYLIST_INDEX_URL = (
    'https://music.163.com/discover/playlist/'
    '?order=hot&cat=%E5%8D%8E%E8%AF%AD&limit=35&offset={page}'
)

# Playlists and comments are both reachable through plain GET endpoints:
#   playlist detail: https://music.163.com/api/playlist/detail?id={playlist id}
#   hot comments:    http://music.163.com/api/v1/resource/comments/R_SO_4_{song id}?limit=20&offset=0
# Example playlist page: https://music.163.com/playlist?id=564322156
HOT_COMMENTS_URL_PREFIX = 'https://music.163.com/api/v1/resource/comments/R_SO_4_'


def scrape_index(url: str) -> list:
    """Return the playlist hrefs found on one playlist-index page.

    Args:
        url: Address of the index page to download. The page that is fetched
            is exactly this URL, so callers control pagination through it.

    Returns:
        The ``href`` attribute values matched by the playlist XPath, in
        document order. Empty when the response body could not be parsed.
    """
    print(url)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(response.content)
    if html is None:
        # etree.HTML yields None for an empty/unparseable body.
        return []
    return html.xpath('/html/body/div[3]/div/ul/li/div/a/@href')


def get_music(offsets=None) -> list:
    """Collect absolute playlist URLs from the hot-playlist index pages.

    Args:
        offsets: Iterable of values substituted into the index URL's
            ``offset`` parameter, one request per value. Defaults to
            ``range(1, 2)`` (a single trial page); pass
            ``range(0, 1295, 35)`` to walk the whole listing.

    Returns:
        A flat list of ``https://music.163.com`` + href strings, in the
        order the pages were visited.
    """
    if offsets is None:
        offsets = range(1, 2)

    playlist_urls = []
    for offset in offsets:
        index_url = PLAYLIST_INDEX_URL.format(page=offset)
        # The scraped hrefs are site-relative, so prefix the site root.
        playlist_urls.extend(BASE_URL + href for href in scrape_index(index_url))
        # NOTE: no delay between pages; add a sleep here to crawl politely.
    return playlist_urls


def _fetch_hot_comments(song_id: str) -> list:
    """Return the hot-comment texts for one song, with line breaks flattened."""
    response = requests.get(
        HOT_COMMENTS_URL_PREFIX + song_id,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    try:
        payload = json.loads(response.text)
    except ValueError:
        # Body was not JSON (presumably an error or block page) - skip the song.
        print('skipping song %s: response is not JSON' % song_id)
        return []
    if not isinstance(payload, dict):
        return []
    # A response without 'hotComments' is treated as "no comments" rather
    # than aborting the whole crawl with a KeyError.
    hot_comments = payload.get('hotComments') or []
    return [
        comment['content'].replace('\n', ' ').replace('\r', ' ')
        for comment in hot_comments
    ]


def get_comment(music_list, limit=10) -> list:
    """Fetch the hot comments of every song in the given playlists.

    Args:
        music_list: Sequence of playlist page URLs.
        limit: Maximum number of playlists (taken from the front of
            ``music_list``) to process. ``None`` processes all of them.

    Returns:
        A list with one entry per processed playlist; each entry is a list
        of comment strings gathered from all songs on that playlist, with
        ``\\n`` and ``\\r`` replaced by spaces.
    """
    comments_per_playlist = []
    for playlist_url in music_list[:limit]:
        print(playlist_url)
        response = requests.get(
            playlist_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        html = etree.HTML(response.content)
        if html is None:
            song_hrefs = []
        else:
            song_hrefs = html.xpath(
                '/html/body/div[3]/div[1]/div/div/div[2]/div[2]/ul/li/a/@href'
            )

        playlist_comments = []
        for href in song_hrefs:
            # The song id is whatever follows the last "id=" in the href.
            playlist_comments.extend(_fetch_hot_comments(href.split("id=")[-1]))
        comments_per_playlist.append(playlist_comments)
    return comments_per_playlist
def main():
    """Crawl the playlists, save their hot comments to CSV, and print them.

    The result of ``get_comment(get_music())`` is written to ``test.csv`` in
    the current working directory (one row per playlist) and every comment
    is then echoed to stdout.
    """
    ans = get_comment(get_music())
    frame = DataFrame(ans)
    # utf_8_sig prepends a BOM to the file.
    frame.to_csv('test.csv', encoding="utf_8_sig")
    for playlist_comments in ans:
        for comment in playlist_comments:
            print(comment)


if __name__ == '__main__':
    main()
结果会保存到本地的 test.csv 文件中。代码没有加延迟,建议挂代理爬取。
Donate
Copyright:
Copyright is owned by the author. For commercial reprints, please contact the author for authorization. For non-commercial reprints, please indicate the source.