
ASOS_AAOS_AQ_pre_mrg.py

 

Merge the ASOS, AAOS, and AQ data for a specific site into one file

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals  # must come before other imports; redundant on Python 3

import keras
print(keras.__version__)
import tensorflow as tf
print(tf.__version__)
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # for GPU_1

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import pandas as pd
from download import download
2.7.0
2.7.0
In [2]:
# os.system("dir C:") #, shell=True)
# os.system("dir", shell=True)
file_asos = 'D:/dataset/my_data/ASOS_merg_2016-2021_136_Andong.csv'
file_aaos = 'D:/dataset/my_data/AAOS_merg_2016-2021_972_Ahndong.csv'
file_aq = 'D:/dataset/my_data/AQ_merge_2016-2021.csv'

asos = pd.read_csv(file_asos, encoding='UTF8')  # cp949 raises an error for this file
aaos = pd.read_csv(file_aaos, encoding='UTF8')  # with cp949 the Date format does not match asos, so keep UTF-8
aq = pd.read_csv(file_aq, encoding='cp949') #, index=0)
## If a read fails, swap between UTF-8 and cp949 until the file decodes correctly.
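To avoid the trial-and-error, the candidate encodings can also be tried in order with a fallback. A minimal sketch; the helper name read_csv_kr and the encoding order are assumptions, not part of the original notebook:

import pandas as pd

def read_csv_kr(path, encodings=('utf-8', 'cp949', 'euc-kr'), **kwargs):
    """Try each candidate encoding in turn and return the first successful read."""
    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, **kwargs)
        except (UnicodeDecodeError, UnicodeError) as err:
            last_err = err   # wrong encoding; try the next candidate
    raise last_err

# e.g. asos = read_csv_kr(file_asos)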
In [3]:
asos.head()
Out[3]:
Site_No  Site  Date  Tair_C  Tair_flag  Rain_mm  Rain_flag  WS_m_s  WS_flag  WD_16deg  ...  Tsfc_C  Tsfc_flag  T5cm_C  T10cm_C  T20cm_C  T30cm_C  Year  Month  Day  Hour
136 안동 2016-01-01 00:00:00 -3.2 0.0 NaN 9.0 0.7 0.0 140.0 ... -5.1 0.0 NaN NaN NaN NaN 2016 1 1 0
136 안동 2016-01-01 01:00:00 -3.9 0.0 NaN NaN 0.8 0.0 140.0 ... -5.4 0.0 NaN NaN NaN NaN 2016 1 1 1
136 안동 2016-01-01 02:00:00 -5.3 0.0 NaN NaN 0.2 0.0 0.0 ... -6.2 0.0 NaN NaN NaN NaN 2016 1 1 2
136 안동 2016-01-01 03:00:00 -4.8 0.0 NaN NaN 1.1 0.0 140.0 ... -6.2 0.0 NaN NaN NaN NaN 2016 1 1 3
136 안동 2016-01-01 04:00:00 -6.3 0.0 NaN NaN 0.4 0.0 0.0 ... -6.9 0.0 NaN NaN NaN NaN 2016 1 1 4

5 rows × 42 columns

In [4]:
if asos['Site'][1] == '안동': 
    site_name = 'Andong'    
site_name
Out[4]:
'Andong'
In [5]:
asos.drop(['Site_No', 'Site', 'Tair_flag', 'WS_flag', 'WD_flag', 'RH_flag',
           'Pa_flag', 'Psfc_flag','Suntime_flag','Sunrad_flag',
           'Snow_cm', 'Snow_3hr_cm','Sfc_stat','Tsfc_flag',
           'Cloud_cover_total','Cloud_cover_mid_low','Cloud_type','Cloud_BH_100m',
           'T5cm_C','T10cm_C','T20cm_C','T30cm_C',
           'Year', 'Month', 'Day', 'Hour'], inplace =True, axis=1)
asos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52608 entries, 0 to 52607
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          52608 non-null  object 
 1   Tair_C        52606 non-null  float64
 2   Rain_mm       4373 non-null   float64
 3   Rain_flag     10012 non-null  float64
 4   WS_m_s        52608 non-null  float64
 5   WD_16deg      52608 non-null  float64
 6   RH_pct        52608 non-null  float64
 7   Pv_hPa        52604 non-null  float64
 8   Td_C          52602 non-null  float64
 9   Pa_hPa        52604 non-null  float64
 10  Psfc_hPa      52605 non-null  float64
 11  Suntime_hr    28720 non-null  float64
 12  Sunrad_MJ_m2  28756 non-null  float64
 13  Vis_10m       52606 non-null  float64
 14  Phen          15943 non-null  float64
 15  Tsfc_C        52599 non-null  float64
dtypes: float64(15), object(1)
memory usage: 6.4+ MB
In [6]:
aaos.drop(['Site_No', 'Site', 
           'Tsoil_20cm','Tsoil_30cm','Tsoil_0.5m','Tsoil_1.5m', 'Tsoil_3.0m', 'Tsoil_5.0m', 
           'Watrlev_cm'
           ], inplace=True, axis=1)
aaos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51385 entries, 0 to 51384
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           51385 non-null  object 
 1   RH_0.5m        24687 non-null  float64
 2   RH_1.5m        51376 non-null  float64
 3   Msoil_10cm     50466 non-null  float64
 4   Msoil_20cm     50466 non-null  float64
 5   Msoil_30cm     50466 non-null  float64
 6   Msoil_50cm     50466 non-null  float64
 7   Tair_0.5m      24692 non-null  float64
 8   Tair_1.5m      51378 non-null  float64
 9   Tair_4.0m      24691 non-null  float64
 10  WS_1.5m        24510 non-null  float64
 11  WS_4.0m        24510 non-null  float64
 12  Tsfc_0m        51384 non-null  float64
 13  Tgrass_0m      50413 non-null  float64
 14  Tsoil_5cm      51385 non-null  float64
 15  Tsoil_10cm     51371 non-null  float64
 16  Tsoil_1.0m     51383 non-null  float64
 17  Radnet_MJ_m2   24694 non-null  float64
 18  Radglob_MJ_m2  24694 non-null  float64
 19  Radrefl_MJ_m2  24694 non-null  float64
 20  Illum_10lux    23683 non-null  float64
 21  Year           51385 non-null  int64  
 22  Month          51385 non-null  int64  
 23  Day            51385 non-null  int64  
 24  Hour           51385 non-null  int64  
dtypes: float64(20), int64(4), object(1)
memory usage: 9.8+ MB
In [7]:
aq = aq[['Datetime', 'SO2', 'NO2', 'O3', 'PM10', 'PM25']]
aq.rename(columns = {'Datetime':'Date'}, inplace =True)
aq.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50400 entries, 0 to 50399
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    50400 non-null  object 
 1   SO2     47034 non-null  float64
 2   NO2     45531 non-null  float64
 3   O3      46572 non-null  float64
 4   PM10    45919 non-null  float64
 5   PM25    33466 non-null  float64
dtypes: float64(5), object(1)
memory usage: 2.3+ MB
In [8]:
asos.head()
Out[8]:
Date  Tair_C  Rain_mm  Rain_flag  WS_m_s  WD_16deg  RH_pct  Pv_hPa  Td_C  Pa_hPa  Psfc_hPa  Suntime_hr  Sunrad_MJ_m2  Vis_10m  Phen  Tsfc_C
2016-01-01 00:00:00 -3.2 NaN 9.0 0.7 140.0 83.0 4.0 -5.6 1015.4 1033.5 NaN NaN 400.0 19.0 -5.1
2016-01-01 01:00:00 -3.9 NaN NaN 0.8 140.0 85.0 3.9 -6.0 1015.1 1033.2 NaN NaN 300.0 19.0 -5.4
2016-01-01 02:00:00 -5.3 NaN NaN 0.2 0.0 88.0 3.6 -6.9 1015.1 1033.3 NaN NaN 300.0 19.0 -6.2
2016-01-01 03:00:00 -4.8 NaN NaN 1.1 140.0 87.0 3.7 -6.6 1015.0 1033.2 NaN NaN 300.0 19.0 -6.2
2016-01-01 04:00:00 -6.3 NaN NaN 0.4 0.0 89.0 3.4 -7.8 1015.3 1033.6 NaN NaN 300.0 19.0 -6.9
In [9]:
aaos.head()
Out[9]:
Date  RH_0.5m  RH_1.5m  Msoil_10cm  Msoil_20cm  Msoil_30cm  Msoil_50cm  Tair_0.5m  Tair_1.5m  Tair_4.0m  ...  Tsoil_10cm  Tsoil_1.0m  Radnet_MJ_m2  Radglob_MJ_m2  Radrefl_MJ_m2  Illum_10lux  Year  Month  Day  Hour
2016-01-01 00:00:00 96.3 98.7 7.0 9.9 15.2 15.8 -4.1 -3.6 -4.9 ... -1.1 6.8 5.65 10.63 4.98 578.0 2016 1 1 0
2016-01-01 01:00:00 97.2 99.8 6.5 10.0 15.2 15.8 -5.0 -4.5 -5.9 ... -1.4 6.8 -0.16 0.00 0.16 568.0 2016 1 1 1
2016-01-01 02:00:00 97.5 97.9 6.0 10.0 15.2 15.8 -5.3 -4.8 -6.4 ... -1.8 6.7 -0.32 0.02 0.34 607.0 2016 1 1 2
2016-01-01 03:00:00 97.4 98.7 5.7 10.1 15.2 15.8 -5.8 -5.5 -6.7 ... -2.1 6.7 -0.44 0.07 0.51 597.0 2016 1 1 3
2016-01-01 04:00:00 96.3 96.4 5.4 10.1 15.2 15.8 -6.4 -5.9 -7.5 ... -2.5 6.7 -0.55 0.14 0.69 579.0 2016 1 1 4

5 rows × 25 columns

In [10]:
aq.head()
Out[10]:
Date  SO2  NO2  O3  PM10  PM25
2016-01-01 01:00:00 0.001 0.026 0.003 75.0 NaN
2016-01-01 02:00:00 0.001 0.022 0.003 81.0 NaN
2016-01-01 03:00:00 0.001 0.021 0.003 77.0 NaN
2016-01-01 04:00:00 0.001 0.019 0.003 73.0 NaN
2016-01-01 05:00:00 0.001 0.018 0.003 73.0 NaN
In [11]:
print(len(asos))
print(len(aaos))
print(len(aq))
52608
51385
50400
In [ ]:
 
In [12]:
### Merge AAOS onto ASOS, keyed on the Date column
# filename_out = "ASOSAQ_filled_" + "_" + str(site_name) + ".csv"
tmp = pd.merge(asos, aaos, how="outer")   # Caution: pd.merge(asos, aaos, on='Date') defaults to an inner join and keeps only 51,385 rows, dropping hours with no AAOS data.
print(len(tmp))
tmp.head()
tmpfile='D:/dataset/my_data/mrg_asos_aaos_' + str(site_name) + '.csv'
tmp.to_csv(tmpfile, header=True, index=False)
52608
In [13]:
print(len(tmp))
52608
In [14]:
### Merge the combined ASOS-AAOS frame with AQ, again keyed on the Date column
mrg = pd.merge(tmp, aq, on='Date')   # default inner join: hours with no AQ record are dropped
mrg.head()
Out[14]:
Date  Tair_C  Rain_mm  Rain_flag  WS_m_s  WD_16deg  RH_pct  Pv_hPa  Td_C  Pa_hPa  ...  Illum_10lux  Year  Month  Day  Hour  SO2  NO2  O3  PM10  PM25
2016-01-01 01:00:00 -3.9 NaN NaN 0.8 140.0 85.0 3.9 -6.0 1015.1 ... 568.0 2016.0 1.0 1.0 1.0 0.001 0.026 0.003 75.0 NaN
2016-01-01 02:00:00 -5.3 NaN NaN 0.2 0.0 88.0 3.6 -6.9 1015.1 ... 607.0 2016.0 1.0 1.0 2.0 0.001 0.022 0.003 81.0 NaN
2016-01-01 03:00:00 -4.8 NaN NaN 1.1 140.0 87.0 3.7 -6.6 1015.0 ... 597.0 2016.0 1.0 1.0 3.0 0.001 0.021 0.003 77.0 NaN
2016-01-01 04:00:00 -6.3 NaN NaN 0.4 0.0 89.0 3.4 -7.8 1015.3 ... 579.0 2016.0 1.0 1.0 4.0 0.001 0.019 0.003 73.0 NaN
2016-01-01 05:00:00 -6.8 NaN NaN 0.4 0.0 89.0 3.3 -8.3 1014.8 ... 555.0 2016.0 1.0 1.0 5.0 0.001 0.018 0.003 73.0 NaN

5 rows × 45 columns
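Because pd.merge(tmp, aq, on='Date') defaults to an inner join, the hours that AQ does not cover are dropped here. If every ASOS/AAOS hour should be kept instead, a left join does that; a minimal sketch using the same tmp and aq frames:

# Keep all 52,608 ASOS/AAOS hours; AQ columns become NaN where no AQ record exists.
mrg_all = pd.merge(tmp, aq, on='Date', how='left')
print(len(tmp), len(mrg_all))   # both 52608, assuming Date is unique in aq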

In [16]:
tmpfile='D:/dataset/my_data/mrg_asos_aaos_aq_' + str(site_name) + '.csv'
mrg.to_csv(tmpfile, header=True, index=False)
In [17]:
tmp2 = pd.read_csv(tmpfile)
tmp2.head()
Out[17]:
Date  Tair_C  Rain_mm  Rain_flag  WS_m_s  WD_16deg  RH_pct  Pv_hPa  Td_C  Pa_hPa  ...  Illum_10lux  Year  Month  Day  Hour  SO2  NO2  O3  PM10  PM25
2016-01-01 01:00:00 -3.9 NaN NaN 0.8 140.0 85.0 3.9 -6.0 1015.1 ... 568.0 2016.0 1.0 1.0 1.0 0.001 0.026 0.003 75.0 NaN
2016-01-01 02:00:00 -5.3 NaN NaN 0.2 0.0 88.0 3.6 -6.9 1015.1 ... 607.0 2016.0 1.0 1.0 2.0 0.001 0.022 0.003 81.0 NaN
2016-01-01 03:00:00 -4.8 NaN NaN 1.1 140.0 87.0 3.7 -6.6 1015.0 ... 597.0 2016.0 1.0 1.0 3.0 0.001 0.021 0.003 77.0 NaN
2016-01-01 04:00:00 -6.3 NaN NaN 0.4 0.0 89.0 3.4 -7.8 1015.3 ... 579.0 2016.0 1.0 1.0 4.0 0.001 0.019 0.003 73.0 NaN
2016-01-01 05:00:00 -6.8 NaN NaN 0.4 0.0 89.0 3.3 -8.3 1014.8 ... 555.0 2016.0 1.0 1.0 5.0 0.001 0.018 0.003 73.0 NaN

5 rows × 45 columns


ASOS_pre0_allsite_with_figs_OK.py

ASOS preprocessing, Step 1

1) Download the yearly OBS_ASOS_TIM_XXXX.csv files from the weather data portal

2) Plot the variables for each site and identify which variables actually contain usable values

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals  # must come before other imports; redundant on Python 3

import keras
print(keras.__version__)
import tensorflow as tf
print(tf.__version__)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import pandas as pd
from download import download


mpl.rcParams['figure.figsize'] = (8,6)
mpl.rcParams['axes.grid'] = False
2.7.0
2.7.0

(1) Read the OBS_ASOS_TIM_comb.csv file

Convert the column names to English

In [24]:
dir_in = "D:/dataset/ASOS"
dir_out = "D:/dataset/my_data"
filename_in = "OBS_ASOS_TIM_comb.csv"
# filename_out = "OBS_ASOS_2015_136.csv"
# print(os.path.isdir(dir)); print(os.path.isfile(os.path.join(dir,file)))
infile = os.path.join(dir_in,filename_in)
# outfile = os.path.join(dir_out, filename_out)
print(infile)
# din = pd.read_csv(infile, encoding='cp949', low_memory=False)   # mixed dtypes in some columns, so low_memory=False is needed
din = pd.read_csv(infile, encoding='UTF8',low_memory=False)   # if cp949 raises an error, switch to UTF-8
D:/dataset/ASOS\OBS_ASOS_TIM_comb.csv
In [18]:
din.head()
Out[18]:
Site_No  Site  Date  Tair_C  Tair_flag  Rain_mm  Rain_flag  WS_m_s  WS_flag  WD_16deg  ...  Cloud_BH_100m  Vis_10m  Sfc_stat  Phen  Tsfc_C  Tsfc_flag  T5cm_C  T10cm_C  T20cm_C  T30cm_C
90 속초 2015-01-01 0:00 NaN NaN NaN NaN 3.5 0.0 290.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
90 속초 2015-01-01 1:00 -5.0 0.0 NaN NaN 3.9 0.0 270.0 ... NaN NaN NaN NaN -3.4 0.0 NaN NaN NaN NaN
90 속초 2015-01-01 2:00 -5.6 0.0 NaN NaN 2.7 0.0 320.0 ... NaN NaN NaN NaN -3.9 0.0 NaN NaN NaN NaN
90 속초 2015-01-01 3:00 -6.2 0.0 NaN NaN 2.1 0.0 270.0 ... 8.0 2000.0 0.0 NaN -4.3 0.0 NaN NaN NaN NaN
90 속초 2015-01-01 4:00 -6.5 0.0 NaN NaN 1.7 0.0 230.0 ... 10.0 2000.0 NaN NaN -4.7 0.0 NaN NaN NaN NaN

5 rows × 38 columns

In [25]:
## Rename the columns to English
din.columns = ['Site_No', 'Site', 'Date', 'Tair_C', 'Tair_flag', 'Rain_mm',
       'Rain_flag', 'WS_m_s', 'WS_flag', 'WD_16deg', 'WD_flag', 'RH_pct',
       'RH_flag', 'Pv_hPa', 'Td_C', 'Pa_hPa', 'Pa_flag', 'Psfc_hPa',
       'Psfc_flag', 'Suntime_hr', 'Suntime_flag', 'Sunrad_MJ_m2',
       'Sunrad_flag', 'Snow_cm', 'Snow_3hr_cm', 'Cloud_cover_total',
       'Cloud_cover_mid_low', 'Cloud_type', 'Cloud_BH_100m', 'Vis_10m',
       'Sfc_stat', 'Phen', 'Tsfc_C', 'Tsfc_flag', 'T5cm_C', 'T10cm_C',
       'T20cm_C', 'T30cm_C']
print(din.dtypes)
Site_No                  int64
Site                    object
Date                    object
Tair_C                 float64
Tair_flag              float64
Rain_mm                float64
Rain_flag              float64
WS_m_s                 float64
WS_flag                float64
WD_16deg               float64
WD_flag                float64
RH_pct                 float64
RH_flag                float64
Pv_hPa                 float64
Td_C                   float64
Pa_hPa                 float64
Pa_flag                float64
Psfc_hPa               float64
Psfc_flag              float64
Suntime_hr             float64
Suntime_flag           float64
Sunrad_MJ_m2           float64
Sunrad_flag            float64
Snow_cm                float64
Snow_3hr_cm            float64
Cloud_cover_total      float64
Cloud_cover_mid_low    float64
Cloud_type              object
Cloud_BH_100m          float64
Vis_10m                float64
Sfc_stat               float64
Phen                   float64
Tsfc_C                 float64
Tsfc_flag              float64
T5cm_C                 float64
T10cm_C                float64
T20cm_C                float64
T30cm_C                float64
dtype: object
In [26]:
print(din['Site_No'].unique())
print(din['Site'].unique())
[ 90  95  98  99 100 101 102 104 105 106 108 112 114 115 116 119 121 127
 129 130 131 133 135 136 137 138 140 143 146 152 155 156 159 162 165 168
 169 170 172 174 175 176 177 184 185 187 188 189 192 201 202 203 211 212
 216 217 221 226 232 235 236 238 243 244 245 247 248 251 252 253 254 255
 257 258 259 260 261 262 263 264 266 268 271 272 273 276 277 278 279 281
 283 284 285 288 289 294 295  93 239]
['속초' '철원' '동두천' '파주' '대관령' '춘천' '백령도' '북강릉' '강릉' '동해' '서울' '인천' '원주'
 '울릉도' '관악산' '수원' '영월' '충주' '서산' '울진' '청주' '대전' '추풍령' '안동' '상주' '포항' '군산'
 '대구' '전주' '울산' '창원' '광주' '부산' '통영' '목포' '여수' '흑산도' '완도' '고창' '순천'
 '진도(첨찰산)' '대구(기)' '홍성' '제주' '고산' '성산' '서귀포' '진주' '강화' '양평' '이천' '인제' '홍천'
 '태백' '정선군' '제천' '보은' '천안' '보령' '부여' '금산' '부안' '임실' '정읍' '남원' '장수' '고창군'
 '영광군' '김해시' '순창군' '북창원' '양산시' '보성군' '강진군' '장흥' '해남' '고흥' '의령군' '함양군'
 '광양시' '진도군' '봉화' '영주' '문경' '청송군' '영덕' '의성' '구미' '영천' '경주시' '거창' '합천' '밀양'
 '산청' '거제' '남해' '북춘천' '세종']
In [27]:
din['Site'].loc[din['Site_No'] == 93]
Out[27]:
828319     북춘천
828320     북춘천
828321     북춘천
828322     북춘천
828323     북춘천
          ... 
4996669    북춘천
4996670    북춘천
4996671    북춘천
4996672    북춘천
4996673    북춘천
Name: Site, Length: 46031, dtype: object

(2) Plot the observed variables for each site and check which are usable

In [28]:
SITE = 136   # target site number; 136 = Andong (matches the output below)
subsite = din.loc[din['Site_No'] == SITE]
ncol=4; nrow=7
plt.figure(figsize=(20,30))
plt.subplot(nrow,ncol,1);plt.plot(subsite['Tair_C']); plt.title("Tair")
plt.subplot(nrow,ncol,2);plt.plot(subsite['Td_C']); plt.title("Td")
plt.subplot(nrow,ncol,3);plt.plot(subsite['Tsfc_C']); plt.title("Tsfc")
plt.subplot(nrow,ncol,4);plt.plot(subsite['T5cm_C']); plt.title("T5cm")
plt.subplot(nrow,ncol,5);plt.plot(subsite['T10cm_C']); plt.title("T10cm")
plt.subplot(nrow,ncol,6);plt.plot(subsite['T20cm_C']); plt.title("T20cm")
plt.subplot(nrow,ncol,7);plt.plot(subsite['T30cm_C']); plt.title("T30cm")
plt.subplot(nrow,ncol,8);plt.plot(subsite['RH_pct']); plt.title("RH")
plt.subplot(nrow,ncol,9);plt.plot(subsite['WS_m_s']); plt.title("WS")
plt.subplot(nrow,ncol,10);plt.plot(subsite['WD_16deg']); plt.title("WD")
plt.subplot(nrow,ncol,11);plt.plot(subsite['Rain_mm']); plt.title("Rain")
plt.subplot(nrow,ncol,12);plt.plot(subsite['Snow_cm']); plt.title("Snow")
plt.subplot(nrow,ncol,13);plt.plot(subsite['Snow_3hr_cm']); plt.title("Snow_3hr")
plt.subplot(nrow,ncol,14);plt.plot(subsite['Pa_hPa']); plt.title("Pa")
plt.subplot(nrow,ncol,15);plt.plot(subsite['Pv_hPa']); plt.title("Pv")
plt.subplot(nrow,ncol,16);plt.plot(subsite['Psfc_hPa']); plt.title("Psfc")
plt.subplot(nrow,ncol,17);plt.plot(subsite['Sunrad_MJ_m2'], color="red"); plt.title("Sunrad")
plt.subplot(nrow,ncol,18);plt.plot(subsite['Suntime_hr']); plt.title("Suntime")
plt.subplot(nrow,ncol,19);plt.plot(subsite['Cloud_cover_total']); plt.title("Cloud_cover")
plt.subplot(nrow,ncol,20);plt.plot(subsite['Cloud_cover_mid_low']); plt.title("Cloud_ML")
# plt.subplot(nrow,ncol,21);plt.plot(subsite['Cloud_type']); plt.title("Cloud_type")
plt.subplot(nrow,ncol,22);plt.plot(subsite['Cloud_BH_100m']); plt.title("Cloud_BH")
plt.subplot(nrow,ncol,23);plt.plot(subsite['Sfc_stat']); plt.title("Sfc")
plt.subplot(nrow,ncol,24);plt.plot(subsite['Phen']); plt.title("Phenomenon")
plt.subplot(nrow,ncol,25);plt.plot(subsite['Vis_10m']); plt.title("Vis")

print(din['Site'].loc[din['Site_No'] == SITE].unique())
['안동']
In [ ]:
 
print(len(df))   # df: the hourly DataFrame being checked (52,608 rows = hourly 2016-2021)
date_data = pd.date_range(start='2016-01-01', end='2022-01-01',  freq='H')
dat  = date_data.to_list()
print(len(dat)-1)

52608

52608

 

tmp = pd.merge(asos, aaos, how="outer")  
print(len(tmp))

52608

The outer merge lines up with the hourly date range created above (52,608 rows).

 

tmp = pd.merge(asos, aaos, on="Date")  
print(len(tmp))

51385

 # Caution: merging with on='Date' (default inner join) keeps only the 51,385 hours present in both frames; hours with missing data are skipped.
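The difference is easy to reproduce on a toy example; a minimal sketch, where the two small frames a and b are made up for illustration:

import pandas as pd

a = pd.DataFrame({'Date': ['2016-01-01 00:00', '2016-01-01 01:00', '2016-01-01 02:00'],
                  'Tair_C': [-3.2, -3.9, -5.3]})
b = pd.DataFrame({'Date': ['2016-01-01 01:00', '2016-01-01 02:00'],
                  'RH_1.5m': [99.8, 97.9]})

print(len(pd.merge(a, b, how='outer')))   # 3 rows: every hour kept, RH_1.5m is NaN at 00:00
print(len(pd.merge(a, b, on='Date')))     # 2 rows: the default inner join drops the hour missing from b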

 

 


Dataset 1: qc_ASOS data (2-3 hour offset)

Dataset 2: mrg_ASOS_AAOS data (2-3 hour offset), x without vis_log, y = flag

Dataset 3: mrg_ASOS_AAOS data (2-3 hour offset), x with vis_log added, y = flag

 

Dataset cautions

Vis_10m must be excluded from the inputs.

(Compared with using the flag alone, results sometimes improve slightly when vis_log is kept.)

 

 

Decision tree   (Dataset 2 > Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.50
   f1-score  | 0.53
   support   | 0.52

Dataset 2

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     10227
         1.0       0.60      0.66      0.63       294

    accuracy                           0.98     10521
   macro avg       0.79      0.82      0.81     10521
weighted avg       0.98      0.98      0.98     10521

 

Dataset 3

           precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     10227
         1.0       0.59      0.62      0.61       294

    accuracy                           0.98     10521
   macro avg       0.79      0.81      0.80     10521
weighted avg       0.98      0.98      0.98     10521

Random forest   (Dataset 2 < Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 1.00
   f1-score  | 0.09
   support   | 0.16

 

Dataset 2

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99     10227
         1.0       1.00      0.07      0.13       294

    accuracy                           0.97     10521
   macro avg       0.99      0.53      0.56     10521
weighted avg       0.97      0.97      0.96     10521

Dataset 3

  precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     10227
         1.0       0.87      0.41      0.56       294

    accuracy                           0.98     10521
   macro avg       0.92      0.71      0.78     10521
weighted avg       0.98      0.98      0.98     10521

 

Naive Bayes   (Dataset 2 < Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 1.0
   f1-score  | 0.09
   support   | 0.16

 

Dataset 2

            precision    recall  f1-score   support

         0.0       0.97      1.00      0.99     10227
         1.0       1.00      0.07      0.13       294

    accuracy                           0.97     10521
   macro avg       0.99      0.53      0.56     10521
weighted avg       0.97      0.97      0.96     10521

Dataset 3

    precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     10227
         1.0       0.87      0.41      0.56       294

    accuracy                           0.98     10521
   macro avg       0.92      0.71      0.78     10521
weighted avg       0.98      0.98      0.98     10521

AdaBoost   (Dataset 2 < Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.75
   f1-score  | 0.54
   support   | 0.63

Dataset 2

            precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     10227
         1.0       0.75      0.57      0.65       294

    accuracy                           0.98     10521
   macro avg       0.87      0.78      0.82     10521
weighted avg       0.98      0.98      0.98     10521

Dataset 3

     precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     10227
         1.0       0.76      0.59      0.66       294

    accuracy                           0.98     10521
   macro avg       0.87      0.79      0.83     10521
weighted avg       0.98      0.98      0.98     10521

 

Gradient boosting (slow to train)   (Dataset 2 > Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.38
   f1-score  | 0.03
   support   | 0.06

Dataset 2

           precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     10227
         1.0       0.78      0.33      0.47       294

    accuracy                           0.98     10521
   macro avg       0.88      0.67      0.73     10521
weighted avg       0.98      0.98      0.97     10521

Dataset 3

         precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     10227
         1.0       0.80      0.32      0.46       294

    accuracy                           0.98     10521
   macro avg       0.89      0.66      0.72     10521
weighted avg       0.98      0.98      0.97     10521

 

Stacking   (Dataset 2 = Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.70
   f1-score  | 0.56
   support   | 0.62

 

Dataset 2

             precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     10227
         1.0       0.75      0.55      0.64       294

    accuracy                           0.98     10521
   macro avg       0.87      0.77      0.81     10521
weighted avg       0.98      0.98      0.98     10521

Dataset 3

            precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     10227
         1.0       0.76      0.55      0.64       294

    accuracy                           0.98     10521
   macro avg       0.88      0.77      0.81     10521
weighted avg       0.98      0.98      0.98     10521

 


Model evaluation data

Dataset 1: qc_ASOS data (2-3 hour offset)

Dataset 2: mrg_ASOS_AAOS data (1-2 hour offset), y = flag only

Dataset 3: mrg_ASOS_AAOS data (1-2 hour offset), x with vis_log added, y = flag

 

Dataset cautions

Vis_10m must be excluded from the inputs.

Compared with using the flag alone, results sometimes improve slightly when vis_log is kept.
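For reference, one way these inputs could be assembled in line with the cautions above. This is only a sketch with made-up data: both the vis_log definition (log of Vis_10m) and the target column name 'flag' are assumptions, not the post's actual code.

import numpy as np
import pandas as pd

# toy stand-in for the merged ASOS-AAOS frame
df = pd.DataFrame({'Date': ['2016-01-01 00:00', '2016-01-01 01:00'],
                   'Tair_C': [-3.2, -3.9],
                   'Vis_10m': [400.0, 300.0],
                   'flag': [0, 1]})              # 'flag' stands in for the actual 0/1 target

df['vis_log'] = np.log1p(df['Vis_10m'])          # assumed definition of vis_log
y = df['flag']
X = df.drop(columns=['Date', 'Vis_10m', 'flag'])   # always drop Vis_10m itself
# Dataset 2: also drop 'vis_log'; Dataset 3: keep it.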

 

 

Code example - Decision tree

# Decision tree
## Fit the model on the training data
from sklearn import tree
clf_tree = tree.DecisionTreeClassifier(random_state=0)
clf_tree.fit(X_train_std, y_train)   # X_train_std / y_train come from the train/test split and scaling steps (not shown)

## Predict on the test data
pred_tree = clf_tree.predict(X_test_std)
print(pred_tree)

## Evaluate the model scores
get_clf_eval(y_test, pred_tree)

## Check the classification report
from sklearn.metrics import classification_report
class_report = classification_report(y_test, pred_tree)
print(class_report)
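get_clf_eval is a user-defined helper that the post does not show. A minimal sketch of what such a helper typically prints, assuming the usual confusion-matrix-plus-scores layout rather than the author's exact implementation:

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix)

def get_clf_eval(y_true, y_pred):
    """Print a confusion matrix and the usual binary-classification scores."""
    print('Confusion matrix:')
    print(confusion_matrix(y_true, y_pred))
    print('accuracy : {:.4f}'.format(accuracy_score(y_true, y_pred)))
    print('precision: {:.4f}'.format(precision_score(y_true, y_pred)))
    print('recall   : {:.4f}'.format(recall_score(y_true, y_pred)))
    print('f1-score : {:.4f}'.format(f1_score(y_true, y_pred)))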

 

Classification reports (round 2 results)

Decision tree   (Dataset 2 > Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.50
   f1-score  | 0.53
   support   | 0.52

Dataset 2

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     10198
         1.0       0.81      0.81      0.81       324

    accuracy                           0.99     10522
   macro avg       0.90      0.90      0.90     10522
weighted avg       0.99      0.99      0.99     10522

Dataset 3

  precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     10198
         1.0       0.81      0.80      0.80       324

    accuracy                           0.99     10522
   macro avg       0.90      0.90      0.90     10522
weighted avg       0.99      0.99      0.99     10522

Random forest   (Dataset 2 < Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 1.00
   f1-score  | 0.09
   support   | 0.16

Dataset 2

             precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       1.00      0.70      0.82       324

    accuracy                           0.99     10522
   macro avg       1.00      0.85      0.91     10522
weighted avg       0.99      0.99      0.99     10522

Dataset 3

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       1.00      0.74      0.85       324

    accuracy                           0.99     10522
   macro avg       0.99      0.87      0.92     10522
weighted avg       0.99      0.99      0.99     10522

 

Naive Bayes   (Dataset 2 < Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 1.0
   f1-score  | 0.09
   support   | 0.16

Dataset 2

       precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       1.00      0.70      0.82       324

    accuracy                           0.99     10522
   macro avg       1.00      0.85      0.91     10522
weighted avg       0.99      0.99      0.99     10522

Dataset 3

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       1.00      0.74      0.85       324

    accuracy                           0.99     10522
   macro avg       0.99      0.87      0.92     10522
weighted avg       0.99      0.99      0.99     10522

AdaBoost   (Dataset 2 < Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.75
   f1-score  | 0.54
   support   | 0.63

Dataset 2

           precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       0.93      0.79      0.85       324

    accuracy                           0.99     10522
   macro avg       0.96      0.89      0.93     10522
weighted avg       0.99      0.99      0.99     10522

Dataset 3

             precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       0.93      0.82      0.87       324

    accuracy                           0.99     10522
   macro avg       0.96      0.91      0.93     10522
weighted avg       0.99      0.99      0.99     10522

Gradient boosting (slow to train)   (Dataset 2 > Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.38
   f1-score  | 0.03
   support   | 0.06

Dataset 2

             precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       0.92      0.83      0.87       324

    accuracy                           0.99     10522
   macro avg       0.96      0.91      0.94     10522
weighted avg       0.99      0.99      0.99     10522

Dataset 3

             precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       0.92      0.81      0.86       324

    accuracy                           0.99     10522
   macro avg       0.96      0.90      0.93     10522
weighted avg       0.99      0.99      0.99     10522

 

Stacking   (Dataset 2 = Dataset 3)

 Dataset 1:
   precision | 1.0
   recall    | 0.70
   f1-score  | 0.56
   support   | 0.62

Dataset 2

             precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       0.96      0.77      0.85       324

    accuracy                           0.99     10522
   macro avg       0.98      0.88      0.92     10522
weighted avg       0.99      0.99      0.99     10522

Dataset 3

             precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     10198
         1.0       0.96      0.77      0.85       324

    accuracy                           0.99     10522
   macro avg       0.98      0.88      0.92     10522
weighted avg       0.99      0.99      0.99     10522

 
