[技术分享]Python3 网页爬取实战应用

stclair2201发布于16 天前 • 267 次阅读

这几天在做公司的生态在线加油服务。由于无法从外部资源中找到各个加油站的准确油价信息,只好转而获取各个区域的油价——在绝大多数场景下,区域油价与真实油价还是比较接近的。所以最后确定从油价平台抓取最新的各区域油价数据,使用 Python 开发,代码如下:

import urllib.request
import re
import json
import time
import os

# Output directory for today's crawl: ./data/<YYYY-MM-DD>/ (local date).
dataDir = "./data/{}/".format(time.strftime('%Y-%m-%d'))
# Sub-directory of dataDir that holds the crawl-completed marker file.
logDir = "{}log/".format(dataDir)

def getOtherOilType():
    """Scrape the province-level 93# and 97# price tables and save them as JSON.

    For each grade in ("93", "97") the page
    ``http://youjia.chemcp.com/<grade>haoqiyoujiage.asp`` is downloaded with
    ``getUrlContent`` and its table rows are extracted with a regular
    expression.  One record per row is collected, with the keys ``url``,
    ``province`` and ``_93`` or ``_97`` (the first three captured cells; the
    remaining two captured cells are ignored).

    All records are written as a single JSON array to
    ``<dataDir>____9397.txt``.
    """
    otherOilType = ["93", "97"]
    # The pattern contains no whitespace because the page is matched after
    # every space and CRLF pair has been stripped from it.  It does not depend
    # on the grade, so it is compiled once outside the loop.
    rowPattern = re.compile(
        r'<tdbgcolor="#FFFFFF"><ahref="/(.*?)/"target="_blank">(.*?)</a></td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td>'
    )
    result = []
    for oil in otherOilType:
        htmlCode = getUrlContent('http://youjia.chemcp.com/' + oil + "haoqiyoujiage.asp")
        htmlCode = htmlCode.replace(' ', '').replace("\r\n", '')
        for price in rowPattern.findall(htmlCode):
            result.append({
                'url': price[0],
                'province': price[1],
                '_' + oil: price[2],
            })
    text = json.dumps(result)
    # Context manager guarantees the file is closed even if the write fails.
    with open(dataDir + "____9397.txt", 'w') as f:
        f.write(text)


def getProvinceList():
    """Crawl every province linked from the site's front page into JSON files.

    The front page ``http://youjia.chemcp.com/`` is downloaded and each table
    link of the form ``http://youjia.chemcp.com/<path>`` is treated as a
    province, yielding a ``(path, name)`` pair.  For every province a file
    ``<dataDir><path without slashes>.txt`` is written containing a JSON
    object with the keys ``url``, ``name``, ``_90``, ``_92``, ``_95``, ``_0``
    and ``data`` (the values returned by ``getProvince``).

    Provinces whose output file already exists are skipped, so an interrupted
    crawl can be re-run without downloading them again.
    """
    htmlCode = getUrlContent('http://youjia.chemcp.com/')
    reg = re.compile(
        r'<td bgcolor="#FFFFFF"><a href="http://youjia.chemcp.com/(.*?)" target="_blank">(.*?)</a></td>'
    )
    for pro in reg.findall(htmlCode):
        proKey = pro[0].replace("/", "")
        profile = dataDir + proKey + ".txt"
        if os.path.exists(profile):
            # Already crawled in an earlier run.
            continue
        proPrice = getProvince(pro)
        proresult = {
            "url": proKey,
            "name": pro[1],
            "_90": proPrice["_90"],
            "_92": proPrice["_92"],
            "_95": proPrice["_95"],
            "_0": proPrice["_0"],
            "data": proPrice["data"],
        }
        # Serialize before opening so a serialization error cannot leave an
        # empty file behind that would be mistaken for a finished province.
        text = json.dumps(proresult)
        # Context manager guarantees the file is closed even if the write fails.
        with open(profile, 'w') as f:
            f.write(text)

def getProvince(pro):
    """Fetch one province page and return its prices plus those of its cities.

    Args:
        pro: Sequence whose first item is the province's URL path relative to
            ``http://youjia.chemcp.com/`` (as captured by ``getProvinceList``).

    Returns:
        A dict with the keys ``_90``, ``_92``, ``_95`` and ``_0`` (the first
        four red-highlighted "元/升" prices on the page, as strings) and
        ``data``, a list with one dict per city.  Each city dict has ``url``,
        ``name`` and the same four price keys and ``data`` list as returned by
        ``getCity``.

    Raises:
        IndexError: If the page contains fewer than four highlighted prices.
    """
    proUrl = pro[0]
    htmlCode = getUrlContent('http://youjia.chemcp.com/' + proUrl)
    provicePrice = re.findall(r'<font color="red">(.*?)元/升</font>', htmlCode)
    data = {
        "_90": provicePrice[0],
        "_92": provicePrice[1],
        "_95": provicePrice[2],
        "_0": provicePrice[3],
        "data": [],
    }
    # proUrl comes from scraped HTML; escape it so characters such as "." or
    # "+" are matched literally rather than interpreted as regex syntax.
    cityPattern = re.compile(
        r'<a href="/' + re.escape(proUrl) + r'(.*?).html" target="_blank">(.*?)今日油价</a>'
    )
    for city in cityPattern.findall(htmlCode):
        # "hamidiqu" is deliberately excluded by the original author; the
        # reason is not visible in this file.
        if city[0] != "hamidiqu":
            cityPrice = getCity({"curl": city[0], "purl": proUrl})
            data["data"].append({
                "url": city[0],
                "name": city[1],
                "_90": cityPrice["_90"],
                "_92": cityPrice["_92"],
                "_95": cityPrice["_95"],
                "_0": cityPrice["_0"],
                "data": cityPrice["data"],
            })
    return data
    

def getCity(url):
    """Fetch one city page and return its prices plus its per-area price rows.

    Args:
        url: Dict with ``purl`` (province URL path) and ``curl`` (city page
            name); the page fetched is
            ``http://youjia.chemcp.com/<purl><curl>.html``.

    Returns:
        A dict with ``_90``, ``_92``, ``_95`` and ``_0`` (the first four
        red-highlighted "元/升" prices on the page, as strings) and ``data``,
        a list of dicts, one per area table row, each with ``url``, ``name``
        (both set to the area name) and the same four price keys.

    Raises:
        IndexError: If the page contains fewer than four highlighted prices.
    """
    banner = "----------------------------------"
    print(banner)
    print("城市:"+url["curl"])
    print(banner)

    pageAddress = 'http://youjia.chemcp.com/'+url['purl']+url['curl']+'.html'
    rawHtml = getUrlContent(pageAddress)

    # Headline prices are matched against the untouched page text.
    headline = re.compile(r'<font color="red">(.*?)元/升</font>').findall(rawHtml)
    data = {
        "_90": headline[0],
        "_92": headline[1],
        "_95": headline[2],
        "_0": headline[3],
        "data": [],
    }

    # The area table is matched after stripping spaces and CRLF pairs, hence
    # the whitespace-free pattern.
    rowPattern = re.compile(r'<tdbgcolor="#FFFFFF">(.*?)今日油价</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td><tdbgcolor="#FFFFFF">(.*?)</td>')
    compactHtml = rawHtml.replace(' ', '').replace("\r\n", '')
    data["data"].extend(
        {
            "url": area,
            "name": area,
            "_90": p90,
            "_92": p92,
            "_95": p95,
            "_0": p0,
        }
        for area, p90, p92, p95, p0 in rowPattern.findall(compactHtml)
    )
    return data


def getUrlContent(url):
    """Download *url* and return its body decoded as GBK text.

    The URL is printed before the request as a progress trace.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    print(url)
    # 500-second timeout, as in the original.  The context manager closes the
    # response (and its socket) even if reading fails.
    with urllib.request.urlopen(url, None, 500) as page:
        rawBytes = page.read()
    return rawBytes.decode("gbk")


def clawer():
    """Run the full crawl once per data directory and record completion.

    If ``<logDir>OK.txt`` does not exist, all provinces are crawled with
    ``getProvinceList`` and then the 93#/97# tables with ``getOtherOilType``.
    Afterwards the marker file is (re)written with the text ``OK``.  If a
    crawl step raises, the marker is not written, so the next run retries.
    """
    okfile = logDir + "OK.txt"
    if not os.path.exists(okfile):
        getProvinceList()
        print("获取 93,97 号省级油价")
        getOtherOilType()
    # Context manager guarantees the marker file is closed even if the write
    # fails.
    with open(okfile, 'w') as f:
        f.write("OK")

# Script entry point: make sure today's data/log directories exist, then crawl.
if __name__ == '__main__':
    try:
        # The mode must be an octal literal.  The original passed decimal 775
        # (== 0o1407), which on POSIX denies the owner write/search access to
        # the log directory.
        os.makedirs(logDir, 0o775)
    except FileExistsError:
        # Only "already exists" is expected; other OS errors (permissions,
        # read-only filesystem, ...) propagate instead of being misreported.
        print("日志目录已存在!")
    clawer()

上效果图:

image image

共收到 4 条回复
y

y

y

收藏一波

y

😄😄😄