使用requests爬取EnergyPlus天气数据并整理为二维字典

2021-08-02

Word count: 564 | Reading time≈ 2 min

爬虫模块

思路：用requests下载网页，以lxml模块解析页面，再用zipfile和tempfile对下载到的zip文件进行解压。
运行代码会在当前文件夹下创建data目录，并把所有天气数据下载、解压到该目录中。

值得注意的是，网站存在访问限制。由于我使用了代理服务器，所以代码中没有加sleep，也没有测试网站反爬虫的上限；如果没有代理，请谨慎使用，或者保守地添加停等（限速）策略。

import requests
import zipfile
import tempfile
from lxml import etree
 
def get_data(url, timeout=30):
    """Download ``url`` and return it together with the raw response body.

    Args:
        url: Address of the resource to fetch, e.g.
            "https://energyplus.net/weather-download/asia_wmo_region_2/CHN//CHN_Anhui.Huoshan.583140_CSWD/all".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A ``(url, content)`` tuple where ``content`` is the response body
        as bytes.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of returning the error page's body
    # as if it were the requested data.
    response.raise_for_status()
    return url, response.content
def unzip(filename, data):
    """Extract an in-memory zip archive into ``./data/<filename>``.

    The path of every extracted member is printed as it is written.

    Args:
        filename: Name of the sub-directory under ``./data/`` that receives
            the archive members.
        data: Raw bytes of the zip archive.

    Raises:
        zipfile.BadZipFile: If ``data`` is not a valid zip archive.
    """
    target_dir = './data/' + filename
    # ZipFile needs a seekable file object, so the bytes are spooled into an
    # anonymous temporary file. Both handles are context-managed so they are
    # released even when extraction fails part-way through.
    with tempfile.TemporaryFile() as tmp_file:
        tmp_file.write(data)
        # No rewind is required here: ZipFile seeks within the file on its
        # own to locate the archive index.
        with zipfile.ZipFile(tmp_file, mode='r') as zf:
            for member in zf.namelist():
                extracted_path = zf.extract(member, target_dir)
                print(extracted_path)

if __name__ == '__main__':
    # Region index page; the links matched below are the per-station hrefs.
    url_main = 'https://energyplus.net/weather-region/asia_wmo_region_2/CHN'
    response = requests.get(url_main)
    html = etree.HTML(response.content)
    name_city = html.xpath('/html/body/div[2]/div/section/div/section/div/a/@href ')
    print(len(name_city))

    # Position in the link list at which downloading begins.
    # NOTE(review): presumably a resume point left over from an interrupted
    # run -- set to 0 to crawl every station from the beginning.
    start_index = 315

    # Iterate over valid positions only. Offsetting the index inside a loop
    # over range(len(name_city)) runs past the end of the list.
    for i in range(start_index, len(name_city)):
        # The last path segment of the href is the station identifier,
        # e.g. "CHN_Anhui.Huoshan.583140_CSWD".
        s = name_city[i].split('/')
        url = "https://energyplus.net/weather-download/asia_wmo_region_2/CHN//"+s[-1]+"/all"
        # Only the CSWD data sets are downloaded; other sources are skipped.
        if "CSWD" not in url:
            continue
        print(i)
        print(url)
        url, data = get_data(url)  # data is the zip archive as bytes
        unzip(s[-1], data)

更换三轮IP后，爬完了全部数据。如需其他国家的数据，修改url_main和url两行中的国家/地区路径即可。结果如图。

数据整理模块

此处使用二维（嵌套）字典：第一层键是省份，第二层键是城市名。向嵌套字典添加键值对时，需要先判断第一层键是否已经存在，具体见代码。

import os

filePath = "./"
# os.walk is a lazy generator: it yields one (dirpath, dirnames, filenames)
# tuple per directory as it descends the tree. Only the first tuple, which
# describes filePath itself, is needed here, so a single next() call is
# enough. The default keeps `d` defined (as an empty list) when the walk
# yields nothing, e.g. because the directory cannot be listed.
d = next(os.walk(filePath), (filePath, [], []))[1]
def addtwodimdict(thedict, key_a, key_b, val):
    """Store ``val`` at ``thedict[key_a][key_b]``, modifying ``thedict`` in place.

    The inner dict for ``key_a`` is created when it does not exist yet; an
    existing value at ``[key_a][key_b]`` is overwritten.
    """
    inner = thedict.setdefault(key_a, {})
    inner[key_b] = val

# Nested mapping: main_dic[province][city] -> directory name.
main_dic = {}
for name in d:
    parts = name.split(".")
    if len(parts) < 2:
        # A name without a dot has no second-to-last part to use as the
        # city and would raise IndexError below, so skip it.
        continue
    # The first part minus its leading four characters is the province
    # (e.g. "CHN_Anhui" -> "Anhui"); the second-to-last part is the city.
    province, city = parts[0][4:], parts[-2]
    addtwodimdict(main_dic, province, city, name)

# To inspect the result:
#     import json
#     print(json.dumps(main_dic, indent=4))

结果如图

Donate

Copyright： Copyright is owned by the author. For commercial reprints, please contact the author for authorization. For non-commercial reprints, please indicate the source.