본문 바로가기
Data Analysis

예제 6. 유가데이터분석

by ram_ 2022. 12. 11.

STEP 01. TABLE + WEB CRAWLING

import mysql.connector
import pandas as pd

conn = mysql.connector.connect(
    host = "", 
    port = , 
    user = "", 
    password = "",
    database = ""
)

cur = conn.cursor(buffered=True)

# table 만들기
cur.execute("CREATE TABLE GAS_BRAND( id int AUTO_INCREMENT PRIMARY KEY, name varchar(16))")
cur.execute("CREATE TABLE GAS_STATION( id int AUTO_INCREMENT PRIMARY KEY, brand int , name varchar(64), city char(2), gu varchar(10), address varchar(128), gasoline int, diesel int, self boolean, car_wash boolean, charging_station boolean, car_maintenance boolean, convenience_store boolean, 24_hours boolean, lat decimal(16, 14), lng decimal(17, 14), FOREIGN KEY (brand) REFERENCES GAS_BRAND(id) )")
conn.close()

cur.execute("INSERT INTO GAS_BRAND VALUES (1, 'SK에너지'), (2, '현대오일뱅크'), (3, 'GS칼텍스'), (4, 'S-OIL'), (5, '알뜰주유소'), (6, '자가상표') ")
conn.commit()
conn.close()

 

# 문자형 숫자형 반환 함수 (테스트 '1,000')
def strTOint(str):

    return int(str.replace(',', ''))

testStr = '1,000'
strTOint(testStr)

# ID 반환하는 함수 (테스트 'SK에너지') 
def brandTOid(brandname):
    dict_cur = conn.cursor(dictionary=True)
    dict_cur.execute("SELECT * FROM GAS_BRAND")
    gas_brand = dict_cur.fetchall()
    
    for item in gas_brand:
        if item['name'] == brandname:
            return item['id']
    return -1

brandTOid('SK에너지')

# 주소 입력하면 구 이름 반환 함수 (테스트 '서울시 강남구 헌릉로 730')
def addressTOgu(myaddress):
    return myaddress.split()[1]

addressTOgu('서울시 강남구 헌릉로 730')

# 주소 받아 위도 경도 반환 함수 (테스트 '서울시 강남구 헌릉로 730')
def addressTOlatlng(myaddress):
    
    import googlemaps
    gmaps_key = ""
    gmaps = googlemaps.Client(key=gmaps_key)

    tmp = gmaps.geocode(myaddress, language='ko')
    lat = tmp[0].get("geometry")["location"]["lat"]
    lng = tmp[0].get("geometry")["location"]["lng"]

    return lat, lng

addressTOlatlng('서울시 강남구 헌릉로 730')

 

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from tqdm import tqdm_notebook
from selenium.webdriver.common.by import By
import mysql.connector
import pandas as pd

conn = mysql.connector.connect(
    host = "", 
    port = , 
    user = "", 
    password = "",
    database = ""
)
cur = conn.cursor(buffered=True)

#ERROR 대응 code
cur.execute("SET FOREIGN_KEY_CHECKS=0;")

 

# url 접근
url = "https://www.opinet.co.kr/searRgSelect.do"
driver = webdriver.Chrome("../driver/chromedriver") 
driver.get(url)
time.sleep(2)
driver.get(url)
time.sleep(2)

# 시/도 클릭
sido = driver.find_element(By.CSS_SELECTOR, "#SIDO_NM0")
sido.click()
# 서울 클릭
sido_list = sido.find_elements(By.CSS_SELECTOR, 'option')
seoul = driver.find_element(By.CSS_SELECTOR, '#SIDO_NM0 > option:nth-child(2)')
seoul.click()

# 부가정보 데이터 타입
## 1. 세차장 //*[@id="CWSH_YN"]
wash = driver.find_element(By.XPATH,'//*[@id="CWSH_YN"]')
wash.click()
## 2. 경정비 //*[@id="MAINT_YN"]
maintenance = driver.find_element(By.XPATH,'//*[@id="MAINT_YN"]')
maintenance.click()
## 3. 편의점 //*[@id="CVS_YN"]
conve = driver.find_element(By.XPATH,'//*[@id="CVS_YN"]')
conve.click()
## 4. 24시간 //*[@id="SEL24_YN"]
twofourH = driver.find_element(By.XPATH,'//*[@id="SEL24_YN"]')
twofourH.click()

# 시/군/구 
gu_list_raw = driver.find_element_by_id("SIGUNGU_NM0").click()
gu_list_raw = driver.find_element_by_id("SIGUNGU_NM0")      # 부모 태그
gu_list = gu_list_raw.find_elements_by_tag_name("option")   # 자식 태그
gu_list_raw = driver.find_element_by_id("SIGUNGU_NM0").click()

# 구 리스트
gu_names = []
for option in gu_list:
    gu_names.append(option.get_attribute("value"))
gu_names = gu_names[1:]

# 조회  //*[@id="searRgSelect"]/span
#searButton = driver.find_element(By.XPATH,'//*[@id="searRgSelect"]/span')
#searButton.click()

 

STEP 02. INSERT DATA

# 데이터 가져오기

sql  = "INSERT INTO GAS_STATION (brand, name, city, gu, address, gasoline, diesel, self, car_wash, charging_station, car_maintenance, convenience_store, 24_hours, lat, lng) \
     VALUES (%s, %s, '서울', %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "

for gu in tqdm_notebook(gu_names):
    sendGu = driver.find_element(By.ID,"SIGUNGU_NM0")
    sendGu.send_keys(gu)
    time.sleep(1)
    req = driver.page_source
    soup = BeautifulSoup(req, "html.parser")
    cnt = len(soup.select("#body1 > tr"))
    time.sleep(1)
    
    for num in range(cnt):
          # 이부분에서 f'num +1' 처리를 안해줬어서 같은 정보가 cnt개 저장되는 실수가 있었다.
          targetStation = f"#body1 > tr:nth-child({num+1}) > td.rlist > a"
          driver.find_element(By.CSS_SELECTOR,targetStation).click()

          time.sleep(1)
          url01 = driver.page_source
          soup1 = BeautifulSoup(url01, "html.parser")
          
          name = soup1.select_one("#os_nm").text
          address = soup1.select_one("#rd_addr").text
          brand = brandTOid(soup1.select_one("#poll_div_nm").text)           
          gasoline = strTOint(soup1.select_one("#b027_p").text)
          diesel = strTOint(soup1.select_one("#d047_p").text)
          self_bool =True if soup1.select_one("#self_icon") else False
          car_wash = True if soup1.select_one("#cwsh_yn").get("src")[-7:-4] != "off" else False
          charging_station = True if soup1.select_one("#lpg_yn").get("src")[-7:-4] != "off" else False
          car_maintenance = True if soup1.select_one("#maint_yn").get("src")[-7:-4] != "off" else False
          convenience_store = True if soup1.select_one("#cvs_yn").get("src")[-7:-4] != "off" else False
          is_24hour = True if soup1.select_one("#sel24_yn").get("src")[-7:-4] != "off" else False
        
          gu = addressTOgu(address)
          lat, lng = addressTOlatlng(address)

          # 데이터 추가
          cur.execute(sql, (brand, name, gu, address, gasoline, diesel, self_bool, car_wash, charging_station, car_maintenance, convenience_store, is_24hour, lat, lng))
          conn.commit()

driver.quit()
cur.execute("select * from GAS_STATION LIMIT 10")
result = cur.fetchall()
pd.DataFrame(result)
# 아이디 대신 브랜드 명 / 정렬 아이디 순 / 
sql = "select s.id, b.name brand, s.name, s.city, s.gu, s.address, s.gasoline, s.diesel, s.self, s.car_wash, s.charging_station, s.car_maintenance, s.convenience_store, s.24_hours, s.lat, s.lng \
    from GAS_BRAND b, GAS_STATION s \
    where b.id = s.brand \
    order by s.id"

cur.execute(sql)
result = cur.fetchall()

num_fields = len(cur.description)
field_names = [i[0] for i in cur.description]

 

df = pd.DataFrame(result)
df.columns = field_names

df.to_csv("[DS].csv", index=False, encoding='utf-8')
df.head()

STEP 03. TEST

# 미왕빌딩 주소 서울특별시 강남구 역삼동 826-21
m_lat, m_lng = addressTOlatlng("서울특별시 강남구 강남대로 364")
m_lat, m_lng

# 거리 측정 명령어
sql = "select s.id, b.name, s.name, s.address, (6371*acos(cos(radians(37.4955366))*cos(radians(s.lat))*cos(radians(s.lng)-radians(127.0293521))+sin(radians(37.4955366))*sin(radians(s.lat)))) distance \
    from GAS___STATION s, GAS__BRAND b \
    where s.brand = b.id \
    HAVING distance <= 1 \
    order by distance"
    
cur.execute(sql)
result = cur.fetchall()

print(result)

-> 미왕빌딩에서 1km 이내에 위치한 주유소 정보를 검색하세요.

 

sql = "select s.id, b.name, s.name, s.address, (6371*acos(cos(radians(37.4955366))*cos(radians(s.lat))*cos(radians(s.lng)-radians(127.0293521))+sin(radians(37.4955366))*sin(radians(s.lat)))) distance \
    from GAS___STATION s, GAS__BRAND b \
    where s.brand = b.id AND self=1 AND 24_hours=1 AND convenience_store=1\
    order by distance \
    limit 10 "
    
cur.execute(sql)
result = cur.fetchall()

print(result)

-> 미왕빌딩에서 셀프주유가 가능하고 24시간이면서 편의점이 있는 가장 가까운 주유소 1-개를 휘발유 가격이 저렴한 순으로 정렬하여 조회하세요. 

=> 포스팅하면서 발견한 내용인데, 검색에는 성공했으나 저렴한 순으로 나열하는 것을 까먹었다 .. 

 

sql = "select s.gu, b.name, avg(s.gasoline) \
        from GAS___STATION s, GAS__BRAND b \
        where b.id = s.brand \
        group by s.gu, b.name \
        order by s.gu, avg(s.gasoline)"
        
cur.execute(sql)
result = cur.fetchall()
pd.DataFrame(result)

-> 구별로 주유소 브랜드 별 휘발유 가격을 조회하여 저렴한 순으로 출력하세요

 

 

 


import mysql.connector
import pandas as pd

conn = mysql.connector.connect(
    host = "", 
    port = , 
    user = "", 
    password = "",
    database = ""
)
cur = conn.cursor(buffered=True)
cur.execute("SET FOREIGN_KEY_CHECKS=1;")

error때문에 세팅해주었던 값을 다시 풀어준다.