연구 노트/탄소 기상 예측

파이썬 전처리 :: ASOS AAOS AQ 자료 병합

airmaster 2022. 7. 5. 18:11
728x90

ASOS_AAOS_AQ_pre_mrg.py

 

특정 사이트 자료 ASOS AAOS AQ 모두 합치기

In [1]:
import keras
print(keras.__version__)
import tensorflow as tf
print(tf.__version__)
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # for GPU_1

from __future__ import absolute_import, division, print_function, unicode_literals
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import pandas as pd
from download import download

import matplotlib.pyplot as plt
import seaborn as sns
2.7.0
2.7.0
In [2]:
# os.system("dir C:") #, shell=True)
# os.system("dir", shell=True)
file_asos = 'D:/dataset/my_data/ASOS_merg_2016-2021_136_Andong.csv'
file_aaos = 'D:/dataset/my_data/AAOS_merg_2016-2021_972_Ahndong.csv'
file_aq = 'D:/dataset/my_data/AQ_merge_2016-2021.csv'

asos = pd.read_csv(file_asos, encoding='UTF8')  # cp949  로 하면 error 발생
aaos = pd.read_csv(file_aaos, encoding='UTF8')  # cp949 로 하면 Date 포맷이 asos 와 불일치 하므로 UTF-8로 고정
aq = pd.read_csv(file_aq, encoding='cp949') #, index=0)
## 만약 에러가 발생하면, UTF8, cp949를 바꿔서 몇번 해 보면(?) 에러가 사라진다.
In [3]:
asos.head()
Out[3]:
Site_NoSiteDateTair_CTair_flagRain_mmRain_flagWS_m_sWS_flagWD_16deg...Tsfc_CTsfc_flagT5cm_CT10cm_CT20cm_CT30cm_CYearMonthDayHour01234
136 안동 2016-01-01 00:00:00 -3.2 0.0 NaN 9.0 0.7 0.0 140.0 ... -5.1 0.0 NaN NaN NaN NaN 2016 1 1 0
136 안동 2016-01-01 01:00:00 -3.9 0.0 NaN NaN 0.8 0.0 140.0 ... -5.4 0.0 NaN NaN NaN NaN 2016 1 1 1
136 안동 2016-01-01 02:00:00 -5.3 0.0 NaN NaN 0.2 0.0 0.0 ... -6.2 0.0 NaN NaN NaN NaN 2016 1 1 2
136 안동 2016-01-01 03:00:00 -4.8 0.0 NaN NaN 1.1 0.0 140.0 ... -6.2 0.0 NaN NaN NaN NaN 2016 1 1 3
136 안동 2016-01-01 04:00:00 -6.3 0.0 NaN NaN 0.4 0.0 0.0 ... -6.9 0.0 NaN NaN NaN NaN 2016 1 1 4

5 rows × 42 columns

In [4]:
if asos['Site'][1] == '안동': 
    site_name = 'Andong'    
site_name
Out[4]:
'Andong'
In [5]:
asos.drop(['Site_No', 'Site', 'Tair_flag', 'WS_flag', 'WD_flag', 'RH_flag',
           'Pa_flag', 'Psfc_flag','Suntime_flag','Sunrad_flag',
           'Snow_cm', 'Snow_3hr_cm','Sfc_stat','Tsfc_flag',
           'Cloud_cover_total','Cloud_cover_mid_low','Cloud_type','Cloud_BH_100m',
           'T5cm_C','T10cm_C','T20cm_C','T30cm_C',
           'Year', 'Month', 'Day', 'Hour'], inplace =True, axis=1)
asos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52608 entries, 0 to 52607
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          52608 non-null  object 
 1   Tair_C        52606 non-null  float64
 2   Rain_mm       4373 non-null   float64
 3   Rain_flag     10012 non-null  float64
 4   WS_m_s        52608 non-null  float64
 5   WD_16deg      52608 non-null  float64
 6   RH_pct        52608 non-null  float64
 7   Pv_hPa        52604 non-null  float64
 8   Td_C          52602 non-null  float64
 9   Pa_hPa        52604 non-null  float64
 10  Psfc_hPa      52605 non-null  float64
 11  Suntime_hr    28720 non-null  float64
 12  Sunrad_MJ_m2  28756 non-null  float64
 13  Vis_10m       52606 non-null  float64
 14  Phen          15943 non-null  float64
 15  Tsfc_C        52599 non-null  float64
dtypes: float64(15), object(1)
memory usage: 6.4+ MB
In [6]:
aaos.drop(['Site_No', 'Site', 
           'Tsoil_20cm','Tsoil_30cm','Tsoil_0.5m','Tsoil_1.5m', 'Tsoil_3.0m', 'Tsoil_5.0m', 
           'Watrlev_cm'
           ], inplace=True, axis=1)
aaos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51385 entries, 0 to 51384
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           51385 non-null  object 
 1   RH_0.5m        24687 non-null  float64
 2   RH_1.5m        51376 non-null  float64
 3   Msoil_10cm     50466 non-null  float64
 4   Msoil_20cm     50466 non-null  float64
 5   Msoil_30cm     50466 non-null  float64
 6   Msoil_50cm     50466 non-null  float64
 7   Tair_0.5m      24692 non-null  float64
 8   Tair_1.5m      51378 non-null  float64
 9   Tair_4.0m      24691 non-null  float64
 10  WS_1.5m        24510 non-null  float64
 11  WS_4.0m        24510 non-null  float64
 12  Tsfc_0m        51384 non-null  float64
 13  Tgrass_0m      50413 non-null  float64
 14  Tsoil_5cm      51385 non-null  float64
 15  Tsoil_10cm     51371 non-null  float64
 16  Tsoil_1.0m     51383 non-null  float64
 17  Radnet_MJ_m2   24694 non-null  float64
 18  Radglob_MJ_m2  24694 non-null  float64
 19  Radrefl_MJ_m2  24694 non-null  float64
 20  Illum_10lux    23683 non-null  float64
 21  Year           51385 non-null  int64  
 22  Month          51385 non-null  int64  
 23  Day            51385 non-null  int64  
 24  Hour           51385 non-null  int64  
dtypes: float64(20), int64(4), object(1)
memory usage: 9.8+ MB
In [7]:
aq = aq[['Datetime', 'SO2', 'NO2', 'O3', 'PM10', 'PM25']]
aq.rename(columns = {'Datetime':'Date'}, inplace =True)
aq.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50400 entries, 0 to 50399
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    50400 non-null  object 
 1   SO2     47034 non-null  float64
 2   NO2     45531 non-null  float64
 3   O3      46572 non-null  float64
 4   PM10    45919 non-null  float64
 5   PM25    33466 non-null  float64
dtypes: float64(5), object(1)
memory usage: 2.3+ MB
In [8]:
asos.head()
Out[8]:
DateTair_CRain_mmRain_flagWS_m_sWD_16degRH_pctPv_hPaTd_CPa_hPaPsfc_hPaSuntime_hrSunrad_MJ_m2Vis_10mPhenTsfc_C01234
2016-01-01 00:00:00 -3.2 NaN 9.0 0.7 140.0 83.0 4.0 -5.6 1015.4 1033.5 NaN NaN 400.0 19.0 -5.1
2016-01-01 01:00:00 -3.9 NaN NaN 0.8 140.0 85.0 3.9 -6.0 1015.1 1033.2 NaN NaN 300.0 19.0 -5.4
2016-01-01 02:00:00 -5.3 NaN NaN 0.2 0.0 88.0 3.6 -6.9 1015.1 1033.3 NaN NaN 300.0 19.0 -6.2
2016-01-01 03:00:00 -4.8 NaN NaN 1.1 140.0 87.0 3.7 -6.6 1015.0 1033.2 NaN NaN 300.0 19.0 -6.2
2016-01-01 04:00:00 -6.3 NaN NaN 0.4 0.0 89.0 3.4 -7.8 1015.3 1033.6 NaN NaN 300.0 19.0 -6.9
In [9]:
aaos.head()
Out[9]:
DateRH_0.5mRH_1.5mMsoil_10cmMsoil_20cmMsoil_30cmMsoil_50cmTair_0.5mTair_1.5mTair_4.0m...Tsoil_10cmTsoil_1.0mRadnet_MJ_m2Radglob_MJ_m2Radrefl_MJ_m2Illum_10luxYearMonthDayHour01234
2016-01-01 00:00:00 96.3 98.7 7.0 9.9 15.2 15.8 -4.1 -3.6 -4.9 ... -1.1 6.8 5.65 10.63 4.98 578.0 2016 1 1 0
2016-01-01 01:00:00 97.2 99.8 6.5 10.0 15.2 15.8 -5.0 -4.5 -5.9 ... -1.4 6.8 -0.16 0.00 0.16 568.0 2016 1 1 1
2016-01-01 02:00:00 97.5 97.9 6.0 10.0 15.2 15.8 -5.3 -4.8 -6.4 ... -1.8 6.7 -0.32 0.02 0.34 607.0 2016 1 1 2
2016-01-01 03:00:00 97.4 98.7 5.7 10.1 15.2 15.8 -5.8 -5.5 -6.7 ... -2.1 6.7 -0.44 0.07 0.51 597.0 2016 1 1 3
2016-01-01 04:00:00 96.3 96.4 5.4 10.1 15.2 15.8 -6.4 -5.9 -7.5 ... -2.5 6.7 -0.55 0.14 0.69 579.0 2016 1 1 4

5 rows × 25 columns

In [10]:
aq.head()
Out[10]:
DateSO2NO2O3PM10PM2501234
2016-01-01 01:00:00 0.001 0.026 0.003 75.0 NaN
2016-01-01 02:00:00 0.001 0.022 0.003 81.0 NaN
2016-01-01 03:00:00 0.001 0.021 0.003 77.0 NaN
2016-01-01 04:00:00 0.001 0.019 0.003 73.0 NaN
2016-01-01 05:00:00 0.001 0.018 0.003 73.0 NaN
In [11]:
print(len(asos))
print(len(aaos))
print(len(aq))
52608
51385
50400
In [ ]:
 
In [12]:
### ASOS 기준으로 AAOS  합치기.  Datetime 컬럼 기준
# filename_out = "ASOSAQ_filled_" + "_" + str(site_name) + ".csv"
tmp = pd.merge(asos, aaos, how="outer")   #주의 !! on='Date'를 사용하면 Date 51385 로 자료 없는 곳은 빠짐.
print(len(tmp))
tmp.head()
tmpfile='D:/dataset/my_data/mrg_asos_aaos_' + str(site_name) + '.csv'
tmp.to_csv(tmpfile, header=True, index=False)
52608
In [13]:
print(len(tmp))
52608
In [14]:
### 위에서 합친 ASOS-AAOS 기준으로 다시 AQ와 합치기  Datetime 컬럼 기준
mrg = pd.merge(tmp, aq, on='Date')
mrg.head()
Out[14]:
DateTair_CRain_mmRain_flagWS_m_sWD_16degRH_pctPv_hPaTd_CPa_hPa...Illum_10luxYearMonthDayHourSO2NO2O3PM10PM2501234
2016-01-01 01:00:00 -3.9 NaN NaN 0.8 140.0 85.0 3.9 -6.0 1015.1 ... 568.0 2016.0 1.0 1.0 1.0 0.001 0.026 0.003 75.0 NaN
2016-01-01 02:00:00 -5.3 NaN NaN 0.2 0.0 88.0 3.6 -6.9 1015.1 ... 607.0 2016.0 1.0 1.0 2.0 0.001 0.022 0.003 81.0 NaN
2016-01-01 03:00:00 -4.8 NaN NaN 1.1 140.0 87.0 3.7 -6.6 1015.0 ... 597.0 2016.0 1.0 1.0 3.0 0.001 0.021 0.003 77.0 NaN
2016-01-01 04:00:00 -6.3 NaN NaN 0.4 0.0 89.0 3.4 -7.8 1015.3 ... 579.0 2016.0 1.0 1.0 4.0 0.001 0.019 0.003 73.0 NaN
2016-01-01 05:00:00 -6.8 NaN NaN 0.4 0.0 89.0 3.3 -8.3 1014.8 ... 555.0 2016.0 1.0 1.0 5.0 0.001 0.018 0.003 73.0 NaN

5 rows × 45 columns

In [16]:
tmpfile='D:/dataset/my_data/mrg_asos_aaos_aq_' + str(site_name) + '.csv'
mrg.to_csv(tmpfile, header=True, index=False)
In [17]:
tmp2 = pd.read_csv(tmpfile)
tmp2.head()
Out[17]:
DateTair_CRain_mmRain_flagWS_m_sWD_16degRH_pctPv_hPaTd_CPa_hPa...Illum_10luxYearMonthDayHourSO2NO2O3PM10PM2501234
2016-01-01 01:00:00 -3.9 NaN NaN 0.8 140.0 85.0 3.9 -6.0 1015.1 ... 568.0 2016.0 1.0 1.0 1.0 0.001 0.026 0.003 75.0 NaN
2016-01-01 02:00:00 -5.3 NaN NaN 0.2 0.0 88.0 3.6 -6.9 1015.1 ... 607.0 2016.0 1.0 1.0 2.0 0.001 0.022 0.003 81.0 NaN
2016-01-01 03:00:00 -4.8 NaN NaN 1.1 140.0 87.0 3.7 -6.6 1015.0 ... 597.0 2016.0 1.0 1.0 3.0 0.001 0.021 0.003 77.0 NaN
2016-01-01 04:00:00 -6.3 NaN NaN 0.4 0.0 89.0 3.4 -7.8 1015.3 ... 579.0 2016.0 1.0 1.0 4.0 0.001 0.019 0.003 73.0 NaN
2016-01-01 05:00:00 -6.8 NaN NaN 0.4 0.0 89.0 3.3 -8.3 1014.8 ... 555.0 2016.0 1.0 1.0 5.0 0.001 0.018 0.003 73.0 NaN

5 rows × 45 columns

728x90