/ PROJECTS

R - Air Pollution Data Analysis

1. About

  • 2020-2, Data Science and R, Final Proejct: Air Pollution Data Analysis

2. Load Data and Preprocess

  • Collected data from Air Korea (https://www.airkorea.or.kr)
library(dplyr)
library(tidyverse)
library(grid)
library(vcd)
library(ggplot2)
library(readxl)
data= read_xlsx("./final_data/data.xlsx");
head(data,5)
nrow(data)
날짜시도측정소명측정소코드아황산가스일산화탄소오존이산화질소PM10PM2.5
2017-07-01 01인천 옹진군 백령도 831492 .0016 .3 .052 .0016 44 35
2017-07-01 02인천 옹진군 백령도 831492 .0016 .3 .052 .0017 21 NA
2017-07-01 03인천 옹진군 백령도 831492 .0017 .3 .051 .0017 32 18
2017-07-01 04인천 옹진군 백령도 831492 .0017 .3 .05 .002 10 NA
2017-07-01 05인천 옹진군 백령도 831492 .0016 .3 .048 .0018 24 22

26304

2.1. Set Attributes

#   시도, 측정소명, 측정소코드는 모두 같으므로 제외
#   날짜는 시구간별 분석을 위해 연, 월, 일, 시로 분리: int타입으로 반환됨
#   측정 대상 오염 물질의 자료형 변환
data <- data %>%
  select(-c(시도,측정소명,측정소코드)) %>%
  separate(col=날짜,
           into=c("year","month","day","hour"),
           convert=TRUE) %>%
  mutate_if(is.character,as.numeric) %>%
  rename(SO2=아황산가스,CO=일산화탄소,O3=오존,NO2=이산화질소)
head(data,5)
yearmonthdayhourSO2COO3NO2PM10PM2.5
2017 7 1 1 0.00160.3 0.052 0.001644 35
2017 7 1 2 0.00160.3 0.052 0.001721 NA
2017 7 1 3 0.00170.3 0.051 0.001732 18
2017 7 1 4 0.00170.3 0.050 0.002010 NA
2017 7 1 5 0.00160.3 0.048 0.001824 22

2.2. Handling NA values

# 일평균 값을 확인
daily <- data %>%
  group_by(year,month,day) %>%
  select_if(is.double) %>%
  summarise_all(mean,na.rm=TRUE) %>%
  ungroup()
summary(daily)
      year          month             day             SO2           
 Min.   :2017   Min.   : 1.000   Min.   : 1.00   Min.   :0.0001765  
 1st Qu.:2018   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:0.0012553  
 Median :2018   Median : 7.000   Median :16.00   Median :0.0017042  
 Mean   :2018   Mean   : 6.522   Mean   :15.73   Mean   :0.0018339  
 3rd Qu.:2019   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:0.0022417  
 Max.   :2020   Max.   :12.000   Max.   :31.00   Max.   :0.0078375  
                                                 NA's   :1          
       CO                O3               NO2                PM10       
 Min.   :0.04583   Min.   :0.01188   Min.   :0.000850   Min.   :  2.00  
 1st Qu.:0.23750   1st Qu.:0.03453   1st Qu.:0.002373   1st Qu.: 20.88  
 Median :0.32917   Median :0.04290   Median :0.003296   Median : 29.25  
 Mean   :0.37287   Mean   :0.04405   Mean   :0.003853   Mean   : 35.58  
 3rd Qu.:0.46667   3rd Qu.:0.05192   3rd Qu.:0.004764   3rd Qu.: 42.54  
 Max.   :1.39167   Max.   :0.09971   Max.   :0.020209   Max.   :217.04  
 NA's   :2                                              NA's   :5       
     PM2.5        
 Min.   :  1.826  
 1st Qu.: 10.042  
 Median : 14.625  
 Mean   : 18.736  
 3rd Qu.: 22.375  
 Max.   :126.375  
 NA's   :23       
# NA: 하루 전체 관측치가 NA인 경우에 발생
# 월평균 값을 확인
monthly <- data %>%
  group_by(year,month) %>%
  select_if(is.double) %>%
  summarise_all(mean,na.rm=TRUE) %>%
  ungroup()
summary(monthly)
      year          month            SO2                 CO        
 Min.   :2017   Min.   : 1.00   Min.   :0.001099   Min.   :0.1911  
 1st Qu.:2018   1st Qu.: 3.75   1st Qu.:0.001394   1st Qu.:0.2646  
 Median :2018   Median : 6.50   Median :0.001789   Median :0.3562  
 Mean   :2018   Mean   : 6.50   Mean   :0.001832   Mean   :0.3730  
 3rd Qu.:2019   3rd Qu.: 9.25   3rd Qu.:0.002084   3rd Qu.:0.4491  
 Max.   :2020   Max.   :12.00   Max.   :0.003140   Max.   :0.7588  
       O3               NO2                PM10           PM2.5      
 Min.   :0.02447   Min.   :0.001857   Min.   :20.24   Min.   :10.62  
 1st Qu.:0.03821   1st Qu.:0.002876   1st Qu.:27.31   1st Qu.:13.94  
 Median :0.04175   Median :0.004066   Median :35.42   Median :18.41  
 Mean   :0.04410   Mean   :0.003850   Mean   :35.69   Mean   :18.92  
 3rd Qu.:0.05154   3rd Qu.:0.004658   3rd Qu.:42.46   3rd Qu.:22.84  
 Max.   :0.06422   Max.   :0.005683   Max.   :58.00   Max.   :36.51  
# NA값들을 일평균 또는 월평균 값으로 대체
for (i in (1:nrow(data))){
  for (j in c("SO2","CO","O3","NO2","PM10","PM2.5")){
    if (is.na(data[i,][[j]])){
      tmp<-data[i,]
      y<-tmp$year
      m<-tmp$month
      d<-tmp$day
      if (!is.nan(daily[daily$year==y & daily$month==m & daily$day==d,][[j]])){
        data[i,][[j]]<-daily[daily$year==y & daily$month==m & daily$day==d,][[j]]
      }else{
        data[i,][[j]]<-monthly[monthly$year==y & monthly$month==m,][[j]]
      }
    }
  }
}
summary(data)
      year          month             day             hour      
 Min.   :2017   Min.   : 1.000   Min.   : 1.00   Min.   : 1.00  
 1st Qu.:2018   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 6.75  
 Median :2018   Median : 7.000   Median :16.00   Median :12.50  
 Mean   :2018   Mean   : 6.522   Mean   :15.73   Mean   :12.50  
 3rd Qu.:2019   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:18.25  
 Max.   :2020   Max.   :12.000   Max.   :31.00   Max.   :24.00  
      SO2                 CO               O3               NO2          
 Min.   :0.000000   Min.   :0.0000   Min.   :0.00100   Min.   :0.000300  
 1st Qu.:0.001200   1st Qu.:0.2000   1st Qu.:0.03400   1st Qu.:0.002100  
 Median :0.001700   Median :0.3000   Median :0.04300   Median :0.003000  
 Mean   :0.001834   Mean   :0.3726   Mean   :0.04405   Mean   :0.003853  
 3rd Qu.:0.002300   3rd Qu.:0.5000   3rd Qu.:0.05200   3rd Qu.:0.004600  
 Max.   :0.034700   Max.   :2.5000   Max.   :0.13200   Max.   :0.045900  
      PM10            PM2.5       
 Min.   :  1.00   Min.   :  0.00  
 1st Qu.: 19.00   1st Qu.:  9.00  
 Median : 28.00   Median : 14.00  
 Mean   : 35.57   Mean   : 18.78  
 3rd Qu.: 43.00   3rd Qu.: 22.77  
 Max.   :461.00   Max.   :219.00  

3. EDA

3.1. 통합대기환경지수(Comprehensive air-quality index), CAI 계산

### 통합대기환경지수CAI 지수를 계산
# 항목별 계산 함수를 정의
SO2_I_p <- function(C_p){
  cut <- c(-Inf,0,0.02,0.05,0.15,1,Inf)
  BP_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0.02,0.02,0.05,0.15,1,1))))
  BP_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,0.021,0.051,0.151,0.151))))
  I_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(50,50,100,250,500,500))))
  I_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,51,101,251,251))))
  I_p <- (((I_hi-I_lo)/(BP_hi-BP_lo)) * (C_p - BP_lo)) + I_lo
  return(I_p)
}

CO_I_p <- function(C_p){
  cut <- c(-Inf,0,2,9,15,50,Inf)
  BP_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(2,2,9,15,50,50))))
  BP_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,2.01,9.01,15.01,15.01))))
  I_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(50,50,100,250,500,500))))
  I_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,51,101,251,251))))
  I_p <- (I_hi-I_lo)/(BP_hi-BP_lo) * (C_p - BP_lo) + I_lo
  return(I_p)
}

O3_I_p <- function(C_p){
  cut <- c(-Inf,0,0.03,0.09,0.15,0.6,Inf)
  BP_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0.03,0.03,0.09,0.15,0.6,0.6))))
  BP_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,0.031,0.091,0.151,0.151))))
  I_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(50,50,100,250,500,500))))
  I_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,51,101,251,251))))
  I_p <- (I_hi-I_lo)/(BP_hi-BP_lo) * (C_p - BP_lo) + I_lo
  return(I_p)
}

NO2_I_p <- function(C_p){
  cut <- c(-Inf,0,0.03,0.06,0.2,2,Inf)
  BP_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0.03,0.03,0.06,0.2,2,2))))
  BP_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,0.031,0.061,0.201,0.201))))
  I_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(50,50,100,250,500,500))))
  I_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,51,101,251,251))))
  I_p <- (I_hi-I_lo)/(BP_hi-BP_lo) * (C_p - BP_lo) + I_lo
  return(I_p)
}

# PM10, PM2.5의 이동평균을 구하기 위한 함수 정의
C12_cal <- function(PM){
  C12 <- c()
  for (i in (1:length(PM))){
    if (i<12){
      tmp <- sum(PM[1:i])/i
    }else{
      tmp <- sum(PM[(i-11):i])/12
    }
    C12[i] <- tmp
  }
  return (C12)
}
C_ai_cal <- function(C_i,M,C12){
  if (C_i < M){
    C_ai <- C_i
  }else if(0.9<= (C_i/C12) && (C_i/C12) <=1.7){
    C_ai <- 0.75*C_i  
  }else {
    C_ai <- C_i
  }
  return (C_ai)
}
PM_cal <- function(PM,m,PM_C12){
  M=m
  C24E=c()
  for (i in (1:length(PM))){
    if (i<4){
      C4 <-0
      for (j in (i:1)){
        C4 <- C4+ C_ai_cal(PM[j],M,PM_C12[j])
      }
      C4/i
    } else{
      C4 <- sum(C_ai_cal(PM[i],M,PM_C12[i]),C_ai_cal(PM[i-1],M,PM_C12[i-1]),
              C_ai_cal(PM[i-2],M,PM_C12[i-2]),C_ai_cal(PM[i-3],M,PM_C12[i-3]))/4
    }
    C12<-PM_C12[i]
    C24E[i] = (C12*12+C4*12)/24
  }
  return (C24E)
}

PM10_I_p <- function(C_p){
  cut <- c(-Inf,0,30,80,150,600,Inf)
  BP_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE,
                                       labels=c(30,30,80,150,600,600))))
  BP_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE,
                                       labels=c(0,0,31,81,151,151))))
  I_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE,
                                      labels=c(50,50,100,250,500,500))))
  I_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE,
                                      labels=c(0,0,51,101,251,251))))
  I_p <- (I_hi-I_lo)/(BP_hi-BP_lo) * (C_p - BP_lo) + I_lo
  return(I_p)
}

PM2.5_I_p <- function(C_p){
  cut <- c(-Inf,0,15,35,75,500,Inf)
  BP_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(15,15,35,75,500,500))))
  BP_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,16,36,76,76))))
  I_hi <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(50,50,100,250,500,500))))
  I_lo <- as.numeric(as.character(cut(C_p, breaks=cut,right=TRUE,
                include.lowest = FALSE, labels=c(0,0,51,101,251,251))))
  I_p <- (I_hi-I_lo)/(BP_hi-BP_lo) * (C_p - BP_lo) + I_lo
  return(I_p)
}

PM10_C12<-C12_cal(data$PM10)
PM10_C_p <- PM_cal(data$PM10,70,PM10_C12)

PM2.5_C12 <- C12_cal(data$PM2.5)
PM2.5_C_p <- PM_cal(data$PM2.5,30,PM2.5_C12)

data <- data %>%
  bind_cols("SO2_I_p" = SO2_I_p(data$SO2)) %>%
  bind_cols("CO_I_p" = CO_I_p(data$CO)) %>%
  bind_cols("O3_I_p" = O3_I_p(data$O3)) %>%
  bind_cols("NO2_I_p" = NO2_I_p(data$NO2)) %>%
  bind_cols("PM10_I_p" = PM10_I_p(PM10_C_p)) %>%
  bind_cols("PM2.5_I_p" = PM2.5_I_p(PM2.5_C_p))
CAI_table <- data %>%
  select(ends_with("I_p")) %>%
  mutate(BAD = rowSums(. >100)) %>%
  mutate(CAI = apply(., 1, max) + case_when(BAD>=3 ~ 75, BAD>=2 ~ 50,TRUE ~ 0)) %>%
  mutate(CAI_index = case_when(CAI>250 ~ 3, CAI>100 ~ 2,
                               CAI>50 ~ 1, TRUE ~ 0)) %>%
  select(starts_with("CAI"))
data <- data %>%
  bind_cols(CAI_table)
head(data[c(11:18)],5)
SO2_I_pCO_I_pO3_I_pNO2_I_pPM10_I_pPM2.5_I_pCAICAI_index
4.00 7.5 68.44068 2.666667 64.00000 88.71711 88.717111
4.00 7.5 68.44068 2.833333 68.75000 103.38782103.387822
4.25 7.5 67.61017 2.833333 84.66667 131.82942131.829422
4.25 7.5 66.77966 3.333333 44.58333 66.44682 66.779661
4.00 7.5 65.11864 3.000000 39.95833 64.79737 65.118641

3.2. Timestep-wise Visualization

  • 시구간별 시각화 및 분석

3.2.1. Hourly

# 시간대(주간: 07~18, 야간: 19~06)에 따른 CAI 지수 분포
CAI_hourly <- data %>%
  mutate(time_tmp = case_when(
    06<hour & hour<19 ~ "daytime", TRUE ~ "nighttime")) %>%
  group_by(year,month,day,time_tmp) %>%
  summarise(mean_CAI=mean(CAI)) %>%
  mutate(CAI_idx = case_when(mean_CAI>250 ~ 3, mean_CAI>100 ~ 2,
                             mean_CAI>50 ~ 1, TRUE ~ 0)) %>% # 해당 시간대의 평균 오염도
  ungroup()
# 시간에 따른 오염도의 분포는 거의 차이가 없음을 확인
CAI_hourly %>%
  group_by(time_tmp,CAI_idx)%>%
  tally()
`summarise()` has grouped output by 'year', 'month', 'day'. You can override using the `.groups` argument.
time_tmpCAI_idxn
daytime 0 100
daytime 1 886
daytime 2 93
daytime 3 17
nighttime0 73
nighttime1 895
nighttime2 115
nighttime3 13

3.2.2. Daily

### 일단위: 일평균 CAI 지수의 분포
CAI_daily <- data %>%
  group_by(year,month,day) %>%
  summarise_at(.vars="CAI", .funs=mean) %>%
  ungroup() %>%
  mutate(CAI_idx = case_when(CAI>250 ~ 3, CAI>100 ~ 2,
                             CAI>50 ~ 1, TRUE ~ 0))
summary(factor(CAI_daily$CAI_idx))
# 평균 수준보다 오염도가 높은 날들은 연중 특정 시기에 집중되어있음
CAI_daily %>%
  unite("date",1:3,sep="/") %>%
  mutate_at("date",as.Date) %>%
  ggplot() +
  geom_point(mapping=aes(x=date,y=CAI))

png

3.2.3. Montly

### 월단위: 오염 수준 나쁨 이상인 "일수" 확인
CAI_monthly <- CAI_daily %>%
  group_by(year,month) %>%
  summarise(mean_CAI = mean(CAI),sum_CAI_idx = sum(CAI_idx>1)) %>%
  mutate(m_tmp = case_when(month<10 ~ paste("0",as.character(month),sep=""),
                           TRUE ~ paste(as.character(month))))%>%
  mutate(y_tmp=as.character(year)) %>%
  unite(date,y_tmp,m_tmp,sep="")
# 6월~9월은 장마 영향으로 추정. 겨울~봄에 집중적.
CAI_monthly %>%
  ggplot() +
  geom_col(mapping=aes(x=date,y=sum_CAI_idx))
`summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

png

3.2.4. Quarterly

### 분기 단위
CAI_qtr <- CAI_monthly %>%
  mutate(qtr_tmp = case_when(
    month < 4 ~ "Q1", month < 7 ~ "Q2", month < 10 ~ "Q3", TRUE ~ "Q4")) %>%
  mutate(y_tmp=as.character(year)) %>%
  unite(Qtr, y_tmp, qtr_tmp, sep="-") %>%
  group_by(Qtr) %>%
  summarise(mean_CAI = mean(mean_CAI), sum_CAI_idx = sum(sum_CAI_idx))
CAI_qtr %>%
  ggplot() +
  geom_col(aes(x=Qtr,y=sum_CAI_idx))

png

3.2.5. Half-yearly

### 반기 단위: 2019년 상반기가 매우 심한 것이었음을 알 수 있음
CAI_half <- CAI_monthly %>%
  transform(date=as.integer(date)) %>%
  mutate(year = case_when(
    date < 201801 ~ "2017_H2", date < 201807 ~ "2018-H1",
    date < 201901 ~ "2018_H2", date < 201907 ~ "2019-H1",
    date < 202001 ~ "2019-H2", TRUE ~ "2020-H1")) %>%
  group_by(year) %>%
  summarise(mean_CAI = mean(mean_CAI), sum_CAI_idx = sum(sum_CAI_idx))
CAI_half %>%
  ggplot()+
  geom_col(aes(x=year,y=sum_CAI_idx))

png

4. Summary

대기오염도는 대체적으로 여름보다 겨울에 높았고, 2019년이 특히 심했다는 것을 알 수 있다. 2020년을 기준으로 비교했을 때, 코로나의 영향은 거의 없었던 것으로 추정된다.