참고 : 데이콘 주식 종료가격 예측 대회
월간 데이콘 주식 종료 가격 예측 Pre Competition (with.데이커) - DACON
분석시각화 대회 코드 공유 게시물은 내용 확인 후 좋아요(투표) 가능합니다.
dacon.io
!pip install finance-datareader
import FinanceDataReader as fdr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
# Samsung Electronics (005930), Kakao (035720), HYBE (352820),
# The Nature Holdings (298540), Koentec (029960)
# The generator issues the five DataReader calls in ticker order while the
# tuple on the left-hand side is being unpacked; every call uses the same
# start and end date.
samsung, kakao, hive, nature, cnt = (
    fdr.DataReader(ticker, '2022-11-01', '2023-04-21')
    for ticker in ('005930', '035720', '352820', '298540', '029960')
)
# Shared configuration: the tickers to forecast and the analysis window.
stock_code = ['005930', '035720', '352820', '298540', '029960']
start_date, end_date = '20221101', '20230421'

# Weekday index of the first day and the '%V' week-number string of the last.
start_weekday = pd.to_datetime(start_date).weekday()
max_weeknum = pd.to_datetime(end_date).strftime('%V')

# One-column frame ('Date') holding every business-day date in the window.
Business_days = pd.DataFrame(
    {'Date': pd.date_range(start_date, end_date, freq='B')}
)

# Quick sanity print-out of the values computed above.
print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}')
display(Business_days.head())
NMAE 점수 계산 함수
def get_prediction(x_close, y_close, x_close_public, estimators=None, weights=None):
    """Fit every estimator and return the weighted sum of their forecasts.

    Args:
        x_close: Training features, one row per sample.
        y_close: Training targets aligned with ``x_close``.
        x_close_public: Feature values of the single sample to forecast.
            Either a flat vector or an already row-shaped ``(1, k)`` array.
        estimators: Objects exposing ``fit(X, y)`` and ``predict(X)``.
            Defaults to the module-level ``models`` list.
        weights: One blend weight per estimator. Defaults to the module-level
            ``models_rate`` list.

    Returns:
        The sum over estimators of ``predict(query) * weight``.

    Raises:
        ValueError: If ``estimators`` and ``weights`` differ in length.
    """
    if estimators is None:
        estimators = models
    if weights is None:
        weights = models_rate
    if len(estimators) != len(weights):
        raise ValueError("estimators and weights must have the same length")

    # reshape(1, -1) always yields exactly one 2D row. np.expand_dims(..., 0)
    # turned an input that was already shaped (1, k) into a 3D array.
    query = np.asarray(x_close_public).reshape(1, -1)

    prediction_close = 0.0
    for estimator, weight in zip(estimators, weights):
        estimator.fit(x_close, y_close)
        # Out-of-place addition so differently shaped outputs (e.g. (1, 1)
        # and (1,)) broadcast instead of failing on an in-place update.
        prediction_close = prediction_close + estimator.predict(query) * weight
    return prediction_close
# Table that stores the prediction results: one row per target day,
# with the day encoded as a YYYYMMDD integer in the 'Day' column.
pred_date = list(range(20230424, 20230429))
pred_table = pd.DataFrame(pred_date, columns=['Day'])
EDA 결과 trend는 딱히 없어 추가 전처리를 하지 않고 진행 (모든 종목에서 적용)
기본 Logic : 월 ~ 금 데이터로 다음주 월 ~ 금 을 차례로 예측하여 주 단위 예측 진행
def NMAE(y_pred, y_train):
    """Return the mean absolute error normalised by the target, in percent.

    Args:
        y_pred: Predicted values (array-like).
        y_train: Ground-truth values (array-like, broadcastable with
            ``y_pred``).

    Returns:
        ``mean(|y_train - y_pred| / |y_train|) * 100``.
    """
    # Convert up front so plain Python lists are accepted as well as arrays.
    predicted = np.asarray(y_pred, dtype=float)
    actual = np.asarray(y_train, dtype=float)
    # Normalise by the magnitude of the target so a negative target cannot
    # contribute a negative term that cancels out real errors.
    return np.mean(np.abs(actual - predicted) / np.abs(actual)) * 100
def _nmae_eval_metric(y_true, y_pred):
    """Adapt NMAE(y_pred, y_train) to the (y_true, y_pred) argument order."""
    return NMAE(y_pred, y_true)


# Ensemble members; get_prediction refits each of them on every call.
model1 = LinearRegression(n_jobs=-1)
# scikit-learn renamed the "mae" criterion to "absolute_error" in 1.0 and
# removed the old spelling in 1.2, so the current name is used here.
model2 = RandomForestRegressor(criterion="absolute_error")
# XGBoost's scikit-learn interface calls a custom metric as
# metric(y_true, y_pred); the adapter keeps NMAE normalising by the target.
model3 = xgb.XGBRegressor(n_jobs=-1, eval_metric=_nmae_eval_metric)
models = [model1, model2, model3]

# Blend weights, matched to `models` by position.
model1_rate = 0.5
model2_rate = 0.4
model3_rate = 0.1
models_rate = [model1_rate, model2_rate, model3_rate]
# For each ticker: download closing prices, lay them out as a
# week-by-weekday grid, build five differently shaped training sets
# ("day 1" .. "day 5"), and collect one blended forecast per set.
# NOTE(review): indentation was lost in this listing, so where the loop body
# ends cannot be determined from the text alone.
for code in stock_code:
# Closing prices only, with the date turned back into a regular column.
data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
# Outer merge with the business-day calendar so business days without a
# quote show up as rows with a missing Close.
data = pd.merge(Business_days, data, how = 'outer')
data['weekday'] = data.Date.apply(lambda x : x.weekday())
# NOTE(review): '%V' carries no year, while the window runs from 2022-11 to
# 2023-04. If the pivot below orders its index as strings, weeks '01'..'16'
# would come before '44'..'52' -- confirm the rows are chronological.
data['weeknum'] = data.Date.apply(lambda x : x.strftime('%V'))
# Fill gaps from the previous quote first, then from the next one.
data.Close = data.Close.ffill()
data.Close = data.Close.bfill()
# Rows are week numbers, columns are weekdays, values are closing prices.
data_close = pd.pivot_table(data = data, values = 'Close', columns = 'weekday', index = 'weeknum')
# Row-major flattening of the grid into one long vector.
data_close_flatten = np.ravel(data_close.to_numpy())
predictions = []
# day 1
# One value per row; x is every value but the last six, y is the same
# stretch shifted forward by one position.
data_close_day1 = data_close_flatten.reshape((-1,1))
# NOTE(review): a boolean mask with the same shape as a 2D array presumably
# returns a flat 1D array, which would make the `[:, k]` column selections
# in the later sections fail -- confirm whether row-wise filtering was meant.
data_close_day1 = data_close_day1[np.logical_not(np.isnan(data_close_day1))]
x_close = data_close_day1[:-6]
y_close = data_close_day1[1:-5]
x_close_public = data_close_day1[-6]
x_close = x_close.reshape((-1,1))
y_close = y_close.reshape((-1,1))
# NOTE(review): the query is reshaped to 2D here and get_prediction adds
# another leading axis with np.expand_dims -- verify the resulting rank.
x_close_public = x_close_public.reshape((-1,1))
prediction_close = get_prediction(x_close, y_close, x_close_public)
predictions.append(prediction_close)
# day 2
# Prepend a copy of the first value and drop the last one (length is
# unchanged), then regroup into rows of two.
data_close_day2 = np.insert(data_close_flatten, 0, data_close_flatten[0])
data_close_day2 = data_close_day2[:-1]
# NOTE(review): reshape((-1, k)) requires the vector length to be divisible
# by k; confirm this holds for the number of weeks in the window.
data_close_day2 = data_close_day2.reshape((-1,2))
data_close_day2 = data_close_day2[np.logical_not(np.isnan(data_close_day2))]
# Target is the second column of the following row.
x_close = data_close_day2[:-3]
y_close = data_close_day2[1:-2][:, 1]
x_close_public = data_close_day2[-3]
prediction_close = get_prediction(x_close, y_close, x_close_public)
predictions.append(prediction_close)
# day 3
# Drop the first value and pad the end with a literal 0 (length is
# unchanged), then regroup into rows of three.
data_close_day3 = data_close_flatten[1:]
# NOTE(review): the padding value is 0 rather than NaN, so the NaN filter
# below does not remove it.
data_close_day3 = np.append(data_close_day3, 0)
data_close_day3 = data_close_day3.reshape((-1, 3))
data_close_day3 = data_close_day3[np.logical_not(np.isnan(data_close_day3))]
# Target is the third column of the following row.
x_close = data_close_day3[:-3]
y_close = data_close_day3[1:-2][:, 2]
x_close_public = data_close_day3[-3]
prediction_close = get_prediction(x_close, y_close, x_close_public)
predictions.append(prediction_close)
# day 4
# Same shift as day 2 (prepend first value, drop last), rows of four.
data_close_day4 = np.insert(data_close_flatten, 0, data_close_flatten[0])
data_close_day4 = data_close_day4[:-1]
data_close_day4 = data_close_day4.reshape((-1, 4))
data_close_day4 = data_close_day4[np.logical_not(np.isnan(data_close_day4))]
# Target is the fourth column of the following row.
x_close = data_close_day4[:-2]
y_close = data_close_day4[1:-1][:, 3]
x_close_public = data_close_day4[-2]
prediction_close = get_prediction(x_close, y_close, x_close_public)
predictions.append(prediction_close)
# day 5
# No shift: rows of five taken straight from the flattened grid.
data_close_day5 = data_close_flatten.reshape((-1, 5))
data_close_day5 = data_close_day5[np.logical_not(np.isnan(data_close_day5))]
# Target is the fifth column of the following row.
x_close = data_close_day5[:-2]
y_close = data_close_day5[1:-1][:, 4]
x_close_public = data_close_day5[-2]
prediction_close = get_prediction(x_close, y_close, x_close_public)
predictions.append(prediction_close)
# NOTE(review): this writes into every ticker column (`stock_code`), not
# only the current `code`, and `.loc[:5]` is label-inclusive -- confirm the
# intended target column and row range.
pred_table.loc[:5, stock_code] = predictions
# Total count of missing cells left in the prediction table.
pred_table.isna().sum().sum()
<성능>
EDA로 ACF, PACF를 그린 결과 종목에 상관없이 다음과 같이 유사한 절단점을 보임.
# ARIMA(0, 1, 1) forecasts: one model per column of `data`, five steps ahead,
# written into `pred_table` in two passes ("Public" and "Private").
# NOTE(review): this legacy `arima_model` module is reported to have been
# removed from newer statsmodels releases -- confirm the installed version.
from statsmodels.tsa.arima_model import ARIMA
# NOTE(review): `sm` is not referenced in the visible lines.
import statsmodels.api as sm
# Use the dates as the index and drop the now redundant column.
data.index = data['Date']
data.drop('Date', axis=1, inplace=True)
# NOTE(review): this assumes `data` has exactly one column per ticker at this
# point; presumably it is a wide frame built outside the visible code --
# verify.
data.columns = stock_code
# Fill missing values from the next available row.
data.fillna(method='bfill', inplace=True)
# Public
# Hold out the last 15 rows and forecast 5 steps from the shortened history.
train = data[:-15]
for col in train.columns:
model = ARIMA(train[col].values, order = (0,1,1))
model_fit = model.fit(trend = 'c', full_output = True, disp = True)
forecast_data = model_fit.forecast(steps=5)
# The first element of the forecast result is used as the predictions.
pred_arima_y = forecast_data[0].tolist()
# NOTE(review): chained indexing (`pred_table[col][:5] = ...`) may assign to
# a temporary object instead of `pred_table` -- confirm, and consider `.loc`.
pred_table[col][:5] = pred_arima_y
# Private
# Train on the full history and fill the rows from position 5 onward.
train = data
for col in train.columns:
model = ARIMA(train[col].values, order = (0,1,1))
model_fit = model.fit(trend = 'c', full_output = True, disp = True)
forecast_data = model_fit.forecast(steps=5)
pred_arima_y = forecast_data[0].tolist()
# NOTE(review): the `pred_table` created earlier in this listing has five
# rows, so `[5:]` would be empty -- presumably a longer table is in use here;
# verify.
pred_table[col][5:] = pred_arima_y
<성능>
위와 동일하게 period = 5(다음 주 영업일 예측)로 설정했지만, Prophet은 period = 21(다음 달 영업일 예측)로 설정했을 때 성능이 더 좋게 나왔음.
# Period = 5
# Runs at module level, where locals() and globals() are the same mapping.
for i in stock_code:
    # Fit a fresh model on this ticker's training frame. The frame is looked
    # up by its dynamic name df_<code>, which is created outside this block.
    prophet_model = Prophet()
    prophet_model.fit(globals()['df_{}'.format(i)])
    # Request 5 *business* days. With the default daily frequency the five
    # future rows are calendar days, so a history that ends on a Friday only
    # reaches the following Wednesday and the inner merge below silently
    # drops the rest of the target week.
    future_prices = prophet_model.make_future_dataframe(periods=5, freq='B')
    forecast = prophet_model.predict(future_prices)
    # Keep the date and the point forecast, renamed to the submission's
    # column names; rename() on the selection returns a new frame.
    fore_cast = forecast[['ds', 'yhat']].rename(columns={'ds': 'Day', 'yhat': i})
    # Keep publishing the per-ticker objects under their original global
    # names so any later cell that reads them still works.
    globals()['m_{}'.format(i)] = prophet_model
    globals()['future_prices_{}'.format(i)] = future_prices
    globals()['forecast_{}'.format(i)] = forecast
    globals()['fore_cast_{}'.format(i)] = fore_cast
    # Add this ticker's forecast column to the submission, matched on 'Day'.
    sample_submission = pd.merge(sample_submission, fore_cast, how='inner', on='Day')
# Period = 21
# Runs at module level, where locals() and globals() are the same mapping.
for i in stock_code:
    # Fit a fresh model on this ticker's frame, looked up as df_<code>.
    prophet_model = Prophet()
    prophet_model.fit(globals()['df_{}'.format(i)])
    # Build the future frame with periods=21 and predict over it.
    horizon = prophet_model.make_future_dataframe(periods=21)
    full_forecast = prophet_model.predict(horizon)
    # Keep the date and the point forecast, renamed to the submission's
    # column names.
    ticker_forecast = full_forecast[['ds', 'yhat']]
    ticker_forecast.rename(columns={'yhat': i}, inplace=True)
    ticker_forecast.rename(columns={'ds': 'Day'}, inplace=True)
    # Publish the per-ticker objects under the same dynamic global names.
    globals().update({
        'm_{}'.format(i): prophet_model,
        'future_prices_{}'.format(i): horizon,
        'forecast_{}'.format(i): full_forecast,
        'fore_cast_{}'.format(i): ticker_forecast,
    })
    # Add this ticker's forecast column to the submission, matched on 'Day'.
    sample_submission = pd.merge(sample_submission, ticker_forecast, how='inner', on='Day')
<성능>
Prophet은 단기간 예측보다는 장기간 period 설정이 더 좋은 효과를 내는 것 같음.
https://facebook.github.io/prophet/docs/non-daily_data.html
Non-Daily Data
Prophet is a forecasting procedure implemented in R and Python. It is fast and provides completely automated forecasts that can be tuned by hand by data scientists and analysts.
facebook.github.io
<Ref>
시계열 분석 스터디 5주차(김연규): Prophet 논문 리뷰 (0) | 2023.05.11 |
---|---|
시계열 분석 스터디 3주차(김연규): ARIMA, SARIMA (1) | 2023.05.10 |
시계열 분석 스터디 2주차(김연규): 전통 시계열 모델 (0) | 2023.05.10 |
시계열 분석 스터디 1주차(김연규) (0) | 2023.05.10 |
시계열 스터디 5주차(김희준) prophet 논문 리뷰 (0) | 2023.05.04 |
댓글 영역