From 7b017a4f3c89aee83253b1ed46b5f32f1856d8ac Mon Sep 17 00:00:00 2001 From: darrenlee Date: Tue, 4 Aug 2020 16:11:39 +0900 Subject: [PATCH 1/5] StockAnalysis within overture --- .idea/.gitignore | 2 ++ .idea/StockAnalysisInPython.iml | 8 ++++++++ .idea/inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 6 ++++++ .idea/modules.xml | 8 ++++++++ .idea/vcs.xml | 6 ++++++ 6 files changed, 36 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/StockAnalysisInPython.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..5c98b42 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,2 @@ +# Default ignored files +/workspace.xml \ No newline at end of file diff --git a/.idea/StockAnalysisInPython.iml b/.idea/StockAnalysisInPython.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/StockAnalysisInPython.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..28a804d --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..3f2e96a --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From b7197b9a21fcdc9e198b6bb5761cc9b4b780d5c9 Mon Sep 17 00:00:00 2001 From: darrenlee Date: Thu, 10 Sep 2020 23:53:05 +0900 Subject: [PATCH 2/5] commit test code --- testCode/eda1.py | 147 +++++++++++++++++++++++++++++++++++++++ testCode/stockpriceDB.py | 20 ++++++ 2 files changed, 167 insertions(+) create mode 100644 testCode/eda1.py create mode 100644 testCode/stockpriceDB.py diff --git a/testCode/eda1.py b/testCode/eda1.py new file mode 100644 index 0000000..f1c653d --- /dev/null +++ b/testCode/eda1.py @@ -0,0 +1,147 @@ +import pandas as pd +import numpy as np +import re +from collections import Counter + + +movies_col =['movie_id','movie_name','genre'] +movies = pd.read_table('/Users/darrenlee/PycharmProjects/overture/data/pfda/ml-1m/movies.dat',sep = '::',header= None,names = movies_col, engine = 'python') + +ratings_col =['user_id','movie_id','rating','timestamp'] +ratings = pd.read_table('/Users/darrenlee/PycharmProjects/overture/data/pfda/ml-1m/ratings.dat',sep = '::',header= None,names = ratings_col, engine = 'python') + +users_col = ['user_id','gender','age','occupation','zip_code'] +users = pd.read_table('/Users/darrenlee/PycharmProjects/overture/data/pfda/ml-1m/users.dat',sep = '::',header= None,names= users_col, engine = 'python') + +mv_rt = pd.merge(movies, ratings, on = "movie_id", how = "left") + +mv_rating = pd.merge(mv_rt, users, on = "user_id", how = 'left' ) + + +occupation_dict = {0: "other" +,1: "academic/educator" +,2: "artist" +,3: "clerical/admin" +,4: "college/grad student" +,5: "customer service" +,6: "doctor/health care" +,7: "executive/managerial" +,8: "farmer" +,9: "homemaker" +,10: "K-12 student" +,11: "lawyer" +,12: "programmer" +,13: "retired" +,14: "sales/marketing" +,15: "scientist" +,16: "self-employed" +,17: "technician/engineer" +,18: "tradesman/craftsman" +,19: "unemployed" +,20: "writer"} + +# occupation title merge +occupation_df = pd.DataFrame.from_dict(occupation_dict,orient='index', columns =["occupation_title"]).reset_index() +mv_rating = pd.merge(mv_rating, occupation_df, left_on = 'occupation' , right_on = 'index', how = 'left') + + +bins = [0,18,24,34,44,55,100] +labels = ['Under 18', '18-24', '25-34', '35-44', '45-55','56+'] + +mv_rating['age_range'] = pd.cut(x=mv_rating['age'], bins=bins, + labels= labels, include_lowest= True) + +mv_rating[['age','age_range']] + + +#4 mv_rating의 영화 제목 컬럼에서 개봉 연도를 분리해서 추가 컬럼을 생성해 주세요. +# mv_rating["year"] + +def split_it(year): + return re.findall('\(.*?\)', year) + + +mv_rating["year"] = mv_rating.movie_name.apply(lambda x : re.findall('(?<=\()\d+', x)[0]) + +mv_rating + +#5 전체 영화의 개수와 평균 평점, 평점 개수를 구해 출력해 주세요. + +# 전체 영화 갯수 +total_movie_num = mv_rating.movie_name.nunique() + +#5 +# 평균 평점 +avg_rating = mv_rating.rating.mean() + +# 평균 평점 갯수 +avg_rating_by_mv = mv_rating["movie_name"].value_counts().mean() + +print(f"총 영화 갯수 :{total_movie_num} \n 평균 평점 :{avg_rating:.2f} \n 영화 별 평균 평점 갯수: {avg_rating_by_mv:.2f}") + + +#6 영화별 평균 평점/ 평점 갯수 구하고 dataframe칼럼으로 추가 +movies_info = pd.DataFrame() + +func_list = ["size","mean"] + +movies_info = mv_rating[["movie_name","rating"]].groupby(mv_rating["movie_name"]).agg(func_list) +movies_info.columns = movies_info.columns.droplevel(0) +movies_info.reset_index(inplace= True) + +movies_info = movies_info.rename(columns = {'' : "movieName","size": "numberOfReviews", "mean":"averageRating"}) + +# 평균 평점이 가장 높은 영화 TOP10과 가장 낮은 WORST10을 구해 출력해 주세요. +movies_info.sort_values(by = ["averageRating","numberOfReviews"], ascending = False).head(10) +movies_info.sort_values(by = ["averageRating","numberOfReviews"], ascending = True).head(10) + +# User 당 평균 평점 개수를 구해 출력해 주시고, 평점을 가장 많이 남긴 User TOP10을 출력해 주세요. +mv_rating.columns +# 평균 평점 갯수 +mv_rating.groupby(mv_rating["user_id"]).agg("size").mean() +# top 10 number of reviews +mv_rating.groupby(mv_rating["user_id"]).agg("size").sort_values(ascending=False).head(10) + +# mv_rating의 장르 컬럼에 들어갈 수 있는 장르를 별도의 데이터 프레임으로 생성해주세요.데이터 프레임 이름은 mv_genre_list로 지정해 주세요. +mv_genre_list = list(mv_rating["genre"].str.split("|", expand =True).unstack().unique()) + + + +#10. mv_genre_list를 활용하여 각 장르에 포함되는 영화의 개수와 평균 평점, 장르별 평점 개수를 구해 출력해 주세요. +mv_genre_df = pd.DataFrame(mv_genre_list, columns= ["genre"]) + +# movies_info_genre = pd.merge(movies_info ,mv_rating[["movie_name","genre"]], on = "movie_name" , how = "left") +movies_info = pd.merge(movies_info, movies, on = "movie_name", how = "inner") + +# 장르별 영화 갯수 +movieNumByGenre = movies_info["genre"].str.split("|", expand = True).unstack().value_counts() + +# 장르 별 평균 평점 +genre_unstack = mv_rating["genre"].str.split("|", expand = True) +tmp_df = genre_unstack.rename(columns = {0:"genre1",1:"genre2",2:"genre3",3:"genre4",4:"genre5",5:"genre6"}) +tmp_df.columns + +mv_rating2 = pd.concat([mv_rating, tmp_df], axis = 1) +# melt + +melted_rating2 = mv_rating2.melt(id_vars =['movie_id', 'movie_name', 'genre', 'user_id', 'rating', 'timestamp', + 'gender', 'age', 'occupation', 'zip_code', 'index', 'occupation_title', + 'age_range', 'year'], value_vars = ['genre1', 'genre2', 'genre3', 'genre4', 'genre5', + 'genre6'] , var_name = ["genre"]) + + +# drop rows with None (genre columns) +melted_rating2 = melted_rating2.dropna(subset = ['value'],axis = 0) + +# value counts +mv_rating[["movie_name","rating"]].groupby(mv_rating["movie_name"]).agg(func_list) + +rating_by_genre = melted_rating2[["movie_name","rating"]].groupby(melted_rating2["value"]).agg(func_list) +rating_by_genre = rating_by_genre.reset_index() +rating_by_genre + + +# 장르 별 평점 갯수 +mv_rating_df = mv_rating["genre"].str.split("|", expand = True).unstack().value_counts() + + diff --git a/testCode/stockpriceDB.py b/testCode/stockpriceDB.py new file mode 100644 index 0000000..2233231 --- /dev/null +++ b/testCode/stockpriceDB.py @@ -0,0 +1,20 @@ +from pandas_datareader import data as pdr +import yfinance as yf + +import matplotlib.pyplot as plt + +yf.pdr_override() + +# samsung electronics +sec = pdr.get_data_yahoo('005930.KS', start = '2018-05-04') +# microsoft +msft = pdr.get_data_yahoo("MSFT",start = '2018-05-04') +amzn = pdr.get_data_yahoo("AMZN", start = '2018-05-04') + +plt.plot(sec.index, sec.Close, 'b', label ="Samsung Electronics") +plt.plot(amzn.index, amzn.Close, 'r--', label ="Amazon") +plt.plot(msft.index, msft.Close, 'g--', label = "Microsoft") + +plt.legend(loc = 'best') +plt.show() + From c0d73c1052ced2799816a49995ef89b3f0e0e8a5 Mon Sep 17 00:00:00 2001 From: darrenlee Date: Thu, 10 Sep 2020 23:58:45 +0900 Subject: [PATCH 3/5] test commmit --- .idea/StockAnalysisInPython.iml | 11 +++++++++++ .idea/inspectionProfiles/profiles_settings.xml | 6 ++++++ .idea/misc.xml | 7 +++++++ .idea/modules.xml | 8 ++++++++ .idea/other.xml | 7 +++++++ .idea/vcs.xml | 6 ++++++ README.md | 1 + 7 files changed, 46 insertions(+) create mode 100644 .idea/StockAnalysisInPython.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/other.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/StockAnalysisInPython.iml b/.idea/StockAnalysisInPython.iml new file mode 100644 index 0000000..78f8be5 --- /dev/null +++ b/.idea/StockAnalysisInPython.iml @@ -0,0 +1,11 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..ae7c9ee --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..3f2e96a --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/other.xml b/.idea/other.xml new file mode 100644 index 0000000..640fd80 --- /dev/null +++ b/.idea/other.xml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 3cdcd0d..7439813 100644 --- a/README.md +++ b/README.md @@ -9,3 +9,4 @@ - 서적에 삽입된 그림의 PPT 원본은 PowerPoint_Materials.pptx 파일에 있습니다. ![Portpolio_optimization](./06_Trading_Strategy/imgs/Portpolio_optimization.jpg) + From a2b05b3de0f320b9ac2df197cc60cf6b5151bc2b Mon Sep 17 00:00:00 2001 From: darrenlee Date: Fri, 11 Sep 2020 00:09:10 +0900 Subject: [PATCH 4/5] test commmit --- testCode/eda1.py | 147 --------------------------------------- testCode/stockpriceDB.py | 20 ------ 2 files changed, 167 deletions(-) delete mode 100644 testCode/eda1.py delete mode 100644 testCode/stockpriceDB.py diff --git a/testCode/eda1.py b/testCode/eda1.py deleted file mode 100644 index f1c653d..0000000 --- a/testCode/eda1.py +++ /dev/null @@ -1,147 +0,0 @@ -import pandas as pd -import numpy as np -import re -from collections import Counter - - -movies_col =['movie_id','movie_name','genre'] -movies = pd.read_table('/Users/darrenlee/PycharmProjects/overture/data/pfda/ml-1m/movies.dat',sep = '::',header= None,names = movies_col, engine = 'python') - -ratings_col =['user_id','movie_id','rating','timestamp'] -ratings = pd.read_table('/Users/darrenlee/PycharmProjects/overture/data/pfda/ml-1m/ratings.dat',sep = '::',header= None,names = ratings_col, engine = 'python') - -users_col = ['user_id','gender','age','occupation','zip_code'] -users = pd.read_table('/Users/darrenlee/PycharmProjects/overture/data/pfda/ml-1m/users.dat',sep = '::',header= None,names= users_col, engine = 'python') - -mv_rt = pd.merge(movies, ratings, on = "movie_id", how = "left") - -mv_rating = pd.merge(mv_rt, users, on = "user_id", how = 'left' ) - - -occupation_dict = {0: "other" -,1: "academic/educator" -,2: "artist" -,3: "clerical/admin" -,4: "college/grad student" -,5: "customer service" -,6: "doctor/health care" -,7: "executive/managerial" -,8: "farmer" -,9: "homemaker" -,10: "K-12 student" -,11: "lawyer" -,12: "programmer" -,13: "retired" -,14: "sales/marketing" -,15: "scientist" -,16: "self-employed" -,17: "technician/engineer" -,18: "tradesman/craftsman" -,19: "unemployed" -,20: "writer"} - -# occupation title merge -occupation_df = pd.DataFrame.from_dict(occupation_dict,orient='index', columns =["occupation_title"]).reset_index() -mv_rating = pd.merge(mv_rating, occupation_df, left_on = 'occupation' , right_on = 'index', how = 'left') - - -bins = [0,18,24,34,44,55,100] -labels = ['Under 18', '18-24', '25-34', '35-44', '45-55','56+'] - -mv_rating['age_range'] = pd.cut(x=mv_rating['age'], bins=bins, - labels= labels, include_lowest= True) - -mv_rating[['age','age_range']] - - -#4 mv_rating의 영화 제목 컬럼에서 개봉 연도를 분리해서 추가 컬럼을 생성해 주세요. -# mv_rating["year"] - -def split_it(year): - return re.findall('\(.*?\)', year) - - -mv_rating["year"] = mv_rating.movie_name.apply(lambda x : re.findall('(?<=\()\d+', x)[0]) - -mv_rating - -#5 전체 영화의 개수와 평균 평점, 평점 개수를 구해 출력해 주세요. - -# 전체 영화 갯수 -total_movie_num = mv_rating.movie_name.nunique() - -#5 -# 평균 평점 -avg_rating = mv_rating.rating.mean() - -# 평균 평점 갯수 -avg_rating_by_mv = mv_rating["movie_name"].value_counts().mean() - -print(f"총 영화 갯수 :{total_movie_num} \n 평균 평점 :{avg_rating:.2f} \n 영화 별 평균 평점 갯수: {avg_rating_by_mv:.2f}") - - -#6 영화별 평균 평점/ 평점 갯수 구하고 dataframe칼럼으로 추가 -movies_info = pd.DataFrame() - -func_list = ["size","mean"] - -movies_info = mv_rating[["movie_name","rating"]].groupby(mv_rating["movie_name"]).agg(func_list) -movies_info.columns = movies_info.columns.droplevel(0) -movies_info.reset_index(inplace= True) - -movies_info = movies_info.rename(columns = {'' : "movieName","size": "numberOfReviews", "mean":"averageRating"}) - -# 평균 평점이 가장 높은 영화 TOP10과 가장 낮은 WORST10을 구해 출력해 주세요. -movies_info.sort_values(by = ["averageRating","numberOfReviews"], ascending = False).head(10) -movies_info.sort_values(by = ["averageRating","numberOfReviews"], ascending = True).head(10) - -# User 당 평균 평점 개수를 구해 출력해 주시고, 평점을 가장 많이 남긴 User TOP10을 출력해 주세요. -mv_rating.columns -# 평균 평점 갯수 -mv_rating.groupby(mv_rating["user_id"]).agg("size").mean() -# top 10 number of reviews -mv_rating.groupby(mv_rating["user_id"]).agg("size").sort_values(ascending=False).head(10) - -# mv_rating의 장르 컬럼에 들어갈 수 있는 장르를 별도의 데이터 프레임으로 생성해주세요.데이터 프레임 이름은 mv_genre_list로 지정해 주세요. -mv_genre_list = list(mv_rating["genre"].str.split("|", expand =True).unstack().unique()) - - - -#10. mv_genre_list를 활용하여 각 장르에 포함되는 영화의 개수와 평균 평점, 장르별 평점 개수를 구해 출력해 주세요. -mv_genre_df = pd.DataFrame(mv_genre_list, columns= ["genre"]) - -# movies_info_genre = pd.merge(movies_info ,mv_rating[["movie_name","genre"]], on = "movie_name" , how = "left") -movies_info = pd.merge(movies_info, movies, on = "movie_name", how = "inner") - -# 장르별 영화 갯수 -movieNumByGenre = movies_info["genre"].str.split("|", expand = True).unstack().value_counts() - -# 장르 별 평균 평점 -genre_unstack = mv_rating["genre"].str.split("|", expand = True) -tmp_df = genre_unstack.rename(columns = {0:"genre1",1:"genre2",2:"genre3",3:"genre4",4:"genre5",5:"genre6"}) -tmp_df.columns - -mv_rating2 = pd.concat([mv_rating, tmp_df], axis = 1) -# melt - -melted_rating2 = mv_rating2.melt(id_vars =['movie_id', 'movie_name', 'genre', 'user_id', 'rating', 'timestamp', - 'gender', 'age', 'occupation', 'zip_code', 'index', 'occupation_title', - 'age_range', 'year'], value_vars = ['genre1', 'genre2', 'genre3', 'genre4', 'genre5', - 'genre6'] , var_name = ["genre"]) - - -# drop rows with None (genre columns) -melted_rating2 = melted_rating2.dropna(subset = ['value'],axis = 0) - -# value counts -mv_rating[["movie_name","rating"]].groupby(mv_rating["movie_name"]).agg(func_list) - -rating_by_genre = melted_rating2[["movie_name","rating"]].groupby(melted_rating2["value"]).agg(func_list) -rating_by_genre = rating_by_genre.reset_index() -rating_by_genre - - -# 장르 별 평점 갯수 -mv_rating_df = mv_rating["genre"].str.split("|", expand = True).unstack().value_counts() - - diff --git a/testCode/stockpriceDB.py b/testCode/stockpriceDB.py deleted file mode 100644 index 2233231..0000000 --- a/testCode/stockpriceDB.py +++ /dev/null @@ -1,20 +0,0 @@ -from pandas_datareader import data as pdr -import yfinance as yf - -import matplotlib.pyplot as plt - -yf.pdr_override() - -# samsung electronics -sec = pdr.get_data_yahoo('005930.KS', start = '2018-05-04') -# microsoft -msft = pdr.get_data_yahoo("MSFT",start = '2018-05-04') -amzn = pdr.get_data_yahoo("AMZN", start = '2018-05-04') - -plt.plot(sec.index, sec.Close, 'b', label ="Samsung Electronics") -plt.plot(amzn.index, amzn.Close, 'r--', label ="Amazon") -plt.plot(msft.index, msft.Close, 'g--', label = "Microsoft") - -plt.legend(loc = 'best') -plt.show() - From a53d132f6222c8b1d232536a7035771cbc3c0e85 Mon Sep 17 00:00:00 2001 From: darrenlee Date: Sun, 20 Sep 2020 17:52:23 +0900 Subject: [PATCH 5/5] sync 13inch --- 01_Stock_Investment/Investar/Analyzer.py | 93 ++++++++++ 01_Stock_Investment/Investar/DBUpdater.py | 170 ++++++++++++++++++ 01_Stock_Investment/Investar/MarketDB.py | 38 ++++ .../ch03_02_DowKospi_Scatter.py | 2 + README.md | 2 +- stockTest/getData.py | 3 + 6 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 01_Stock_Investment/Investar/Analyzer.py create mode 100644 01_Stock_Investment/Investar/DBUpdater.py create mode 100644 01_Stock_Investment/Investar/MarketDB.py create mode 100644 stockTest/getData.py diff --git a/01_Stock_Investment/Investar/Analyzer.py b/01_Stock_Investment/Investar/Analyzer.py new file mode 100644 index 0000000..1aa7ef1 --- /dev/null +++ b/01_Stock_Investment/Investar/Analyzer.py @@ -0,0 +1,93 @@ +import pandas as pd +import pymysql +from datetime import datetime +from datetime import timedelta +import re + +class MarketDB: + def __init__(self): + """생성자: MariaDB 연결 및 종목코드 딕셔너리 생성""" + self.conn = pymysql.connect(host='localhost', user='root', + password='snake.land.', db='INVESTAR', charset='utf8') + self.codes = {} + self.get_comp_info() + + def __del__(self): + """소멸자: MariaDB 연결 해제""" + self.conn.close() + + def get_comp_info(self): + """company_info 테이블에서 읽어와서 codes에 저장""" + sql = "SELECT * FROM company_info" + krx = pd.read_sql(sql, self.conn) + for idx in range(len(krx)): + self.codes[krx['code'].values[idx]] = krx['company'].values[idx] + + def get_daily_price(self, code, start_date=None, end_date=None): + """KRX 종목의 일별 시세를 데이터프레임 형태로 반환 + - code : KRX 종목코드('005930') 또는 상장기업명('삼성전자') + - start_date : 조회 시작일('2020-01-01'), 미입력 시 1년 전 오늘 + - end_date : 조회 종료일('2020-12-31'), 미입력 시 오늘 날짜 + """ + if start_date is None: + one_year_ago = datetime.today() - timedelta(days=365) + start_date = one_year_ago.strftime('%Y-%m-%d') + print("start_date is initialized to '{}'".format(start_date)) + else: + start_lst = re.split('\D+', start_date) + if start_lst[0] == '': + start_lst = start_lst[1:] + start_year = int(start_lst[0]) + start_month = int(start_lst[1]) + start_day = int(start_lst[2]) + if start_year < 1900 or start_year > 2200: + print(f"ValueError: start_year({start_year:d}) is wrong.") + return + if start_month < 1 or start_month > 12: + print(f"ValueError: start_month({start_month:d}) is wrong.") + return + if start_day < 1 or start_day > 31: + print(f"ValueError: start_day({start_day:d}) is wrong.") + return + start_date=f"{start_year:04d}-{start_month:02d}-{start_day:02d}" + + if end_date is None: + end_date = datetime.today().strftime('%Y-%m-%d') + print("end_date is initialized to '{}'".format(end_date)) + else: + end_lst = re.split('\D+', end_date) + if end_lst[0] == '': + end_lst = end_lst[1:] + end_year = int(end_lst[0]) + end_month = int(end_lst[1]) + end_day = int(end_lst[2]) + if end_year < 1800 or end_year > 2200: + print(f"ValueError: end_year({end_year:d}) is wrong.") + return + if end_month < 1 or end_month > 12: + print(f"ValueError: end_month({end_month:d}) is wrong.") + return + if end_day < 1 or end_day > 31: + print(f"ValueError: end_day({end_day:d}) is wrong.") + return + end_date = f"{end_year:04d}-{end_month:02d}-{end_day:02d}" + + codes_keys = list(self.codes.keys()) + codes_values = list(self.codes.values()) + + if code in codes_keys: + pass + elif code in codes_values: + idx = codes_values.index(code) + code = codes_keys[idx] + else: + print(f"ValueError: Code({code}) doesn't exist.") + sql = f"SELECT * FROM daily_price WHERE code = '{code}'"\ + f" and date >= '{start_date}' and date <= '{end_date}'" + df = pd.read_sql(sql, self.conn) + df.index = df['date'] + return df + + + + diff --git a/01_Stock_Investment/Investar/DBUpdater.py b/01_Stock_Investment/Investar/DBUpdater.py new file mode 100644 index 0000000..3e1e204 --- /dev/null +++ b/01_Stock_Investment/Investar/DBUpdater.py @@ -0,0 +1,170 @@ + +import pandas as pd +from bs4 import BeautifulSoup +import urllib, pymysql, calendar, time, json +from urllib.request import urlopen +from datetime import datetime +from threading import Timer + +class DBUpdater: + def __init__(self): + """생성자: MariaDB 연결 및 종목코드 딕셔너리 생성""" + self.conn = pymysql.connect(host='localhost', user='root', + password='snake.land.', db='INVESTAR', charset='utf8') + + with self.conn.cursor() as curs: + sql = """ + CREATE TABLE IF NOT EXISTS company_info ( + code VARCHAR(20), + company VARCHAR(40), + last_update DATE, + PRIMARY KEY (code)) + """ + curs.execute(sql) + sql = """ + CREATE TABLE IF NOT EXISTS daily_price ( + code VARCHAR(20), + date DATE, + open BIGINT(20), + high BIGINT(20), + low BIGINT(20), + close BIGINT(20), + diff BIGINT(20), + volume BIGINT(20), + PRIMARY KEY (code, date)) + """ + curs.execute(sql) + self.conn.commit() + self.codes = dict() + + def __del__(self): + """소멸자: MariaDB 연결 해제""" + self.conn.close() + + def read_krx_code(self): + """KRX로부터 상장기업 목록 파일을 읽어와서 데이터프레임으로 반환""" + url = 'http://kind.krx.co.kr/corpgeneral/corpList.do?method='\ + 'download&searchType=13' + krx = pd.read_html(url, header=0)[0] + krx = krx[['종목코드', '회사명']] + krx = krx.rename(columns={'종목코드': 'code', '회사명': 'company'}) + krx.code = krx.code.map('{:06d}'.format) + return krx + + def update_comp_info(self): + """종목코드를 company_info 테이블에 업데이트 한 후 딕셔너리에 저장""" + sql = "SELECT * FROM company_info" + df = pd.read_sql(sql, self.conn) + for idx in range(len(df)): + self.codes[df['code'].values[idx]] = df['company'].values[idx] + + with self.conn.cursor() as curs: + sql = "SELECT max(last_update) FROM company_info" + curs.execute(sql) + rs = curs.fetchone() + today = datetime.today().strftime('%Y-%m-%d') + if rs[0] == None or rs[0].strftime('%Y-%m-%d') < today: + krx = self.read_krx_code() + for idx in range(len(krx)): + code = krx.code.values[idx] + company = krx.company.values[idx] + sql = f"REPLACE INTO company_info (code, company, last"\ + f"_update) VALUES ('{code}', '{company}', '{today}')" + curs.execute(sql) + self.codes[code] = company + tmnow = datetime.now().strftime('%Y-%m-%d %H:%M') + print(f"[{tmnow}] #{idx+1:04d} REPLACE INTO company_info "\ + f"VALUES ({code}, {company}, {today})") + self.conn.commit() + print('') + + def read_naver(self, code, company, pages_to_fetch): + """네이버에서 주식 시세를 읽어서 데이터프레임으로 반환""" + try: + url = f"http://finance.naver.com/item/sise_day.nhn?code={code}" + with urlopen(url) as doc: + if doc is None: + return None + html = BeautifulSoup(doc, "lxml") + pgrr = html.find("td", class_="pgRR") + if pgrr is None: + return None + s = str(pgrr.a["href"]).split('=') + lastpage = s[-1] + df = pd.DataFrame() + pages = min(int(lastpage), pages_to_fetch) + for page in range(1, pages + 1): + pg_url = '{}&page={}'.format(url, page) + df = df.append(pd.read_html(pg_url, header=0)[0]) + tmnow = datetime.now().strftime('%Y-%m-%d %H:%M') + print('[{}] {} ({}) : {:04d}/{:04d} pages are downloading...'. + format(tmnow, company, code, page, pages), end="\r") + df = df.rename(columns={'날짜':'date','종가':'close','전일비':'diff' + ,'시가':'open','고가':'high','저가':'low','거래량':'volume'}) + df['date'] = df['date'].replace('.', '-') + df = df.dropna() + df[['close', 'diff', 'open', 'high', 'low', 'volume']] = df[['close', + 'diff', 'open', 'high', 'low', 'volume']].astype(int) + df = df[['date', 'open', 'high', 'low', 'close', 'diff', 'volume']] + except Exception as e: + print('Exception occured :', str(e)) + return None + return df + + def replace_into_db(self, df, num, code, company): + """네이버에서 읽어온 주식 시세를 DB에 REPLACE""" + with self.conn.cursor() as curs: + for r in df.itertuples(): + sql = f"REPLACE INTO daily_price VALUES ('{code}', "\ + f"'{r.date}', {r.open}, {r.high}, {r.low}, {r.close}, "\ + f"{r.diff}, {r.volume})" + curs.execute(sql) + self.conn.commit() + print('[{}] #{:04d} {} ({}) : {} rows > REPLACE INTO daily_'\ + 'price [OK]'.format(datetime.now().strftime('%Y-%m-%d'\ + ' %H:%M'), num+1, company, code, len(df))) + + def update_daily_price(self, pages_to_fetch): + """KRX 상장법인의 주식 시세를 네이버로부터 읽어서 DB에 업데이트""" + for idx, code in enumerate(self.codes): + df = self.read_naver(code, self.codes[code], pages_to_fetch) + if df is None: + continue + self.replace_into_db(df, idx, code, self.codes[code]) + + def execute_daily(self): + """실행 즉시 및 매일 오후 다섯시에 daily_price 테이블 업데이트""" + self.update_comp_info() + + try: + with open('config.json', 'r') as in_file: + config = json.load(in_file) + pages_to_fetch = config['pages_to_fetch'] + except FileNotFoundError: + with open('config.json', 'w') as out_file: + pages_to_fetch = 100 + config = {'pages_to_fetch': 1} + json.dump(config, out_file) + self.update_daily_price(pages_to_fetch) + + tmnow = datetime.now() + lastday = calendar.monthrange(tmnow.year, tmnow.month)[1] + if tmnow.month == 12 and tmnow.day == lastday: + tmnext = tmnow.replace(year=tmnow.year+1, month=1, day=1, + hour=17, minute=0, second=0) + elif tmnow.day == lastday: + tmnext = tmnow.replace(month=tmnow.month+1, day=1, hour=17, + minute=0, second=0) + else: + tmnext = tmnow.replace(day=tmnow.day+1, hour=17, minute=0, + second=0) + tmdiff = tmnext - tmnow + secs = tmdiff.seconds + t = Timer(secs, self.execute_daily) + print("Waiting for next update ({}) ... ".format(tmnext.strftime + ('%Y-%m-%d %H:%M'))) + t.start() + +if __name__ == '__main__': + dbu = DBUpdater() + dbu.execute_daily() diff --git a/01_Stock_Investment/Investar/MarketDB.py b/01_Stock_Investment/Investar/MarketDB.py new file mode 100644 index 0000000..51966df --- /dev/null +++ b/01_Stock_Investment/Investar/MarketDB.py @@ -0,0 +1,38 @@ +import pandas as pd +#from bs4 import BeautifulSoup +#import urllib +#from urllib.request import urlopen +import pymysql +#import time +#import pandas.io.sql as sql +from datetime import datetime +#from threading import Timer +#import matplotlib.pyplot as plt + +class MarketDB: + def __init__(self): + """생성자: MariaDB 연결 및 종목코드 딕셔너리 생성""" + self.conn = pymysql.connect(host='localhost', user='root', password='snake.land.', db='INVESTAR', charset='utf8') + self.codes = dict() + self.getCompanyInfo() + + def __del__(self): + """소멸자: MariaDB 연결 해제""" + self.conn.close() + + def getCompanyInfo(self): + """company_info 테이블에서 읽어와서 companyData와 codes에 저장""" + sql = "SELECT * FROM company_info" + companyInfo = pd.read_sql(sql, self.conn) + for idx in range(len(companyInfo)): + self.codes[companyInfo['code'].values[idx]] = companyInfo['company'].values[idx] + + def getDailyPrice(self, code, startDate, endDate): + """daily_price 테이블에서 읽어와서 데이터프레임으로 반환""" + sql = "SELECT * FROM daily_price WHERE code = '{}' and date >= '{}' and date <= '{}'".format(code, startDate, endDate) + df = pd.read_sql(sql, self.conn) + df.index = df['date'] + return df + + + diff --git a/03_NumPy_and_Pandas/ch03_02_DowKospi_Scatter.py b/03_NumPy_and_Pandas/ch03_02_DowKospi_Scatter.py index 4a733bd..5511f63 100644 --- a/03_NumPy_and_Pandas/ch03_02_DowKospi_Scatter.py +++ b/03_NumPy_and_Pandas/ch03_02_DowKospi_Scatter.py @@ -6,6 +6,8 @@ dow = pdr.get_data_yahoo('^DJI', '2000-01-04') kospi = pdr.get_data_yahoo('^KS11', '2000-01-04') +dow + df = pd.DataFrame({'DOW' dow['Close'], 'KOSPI' kospi['Close']}) df = df.fillna(method='bfill') df = df.fillna(method='ffill') diff --git a/README.md b/README.md index 7439813..4660ec9 100644 --- a/README.md +++ b/README.md @@ -8,5 +8,5 @@ - 서적에 삽입된 그림의 PPT 원본은 PowerPoint_Materials.pptx 파일에 있습니다. -![Portpolio_optimization](./06_Trading_Strategy/imgs/Portpolio_optimization.jpg) +![Portpolio_optimization](06_Trading_Strategy/imgs/Portpolio_optimization.jpg) diff --git a/stockTest/getData.py b/stockTest/getData.py new file mode 100644 index 0000000..5f27803 --- /dev/null +++ b/stockTest/getData.py @@ -0,0 +1,3 @@ +import pandas as pd +import numpy as np +