import pandas as pd
import talib as tl
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
from sklearn.grid_search import GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score


##sma
def SMA(data, ndays,tag): 
 SMA = pd.Series(pd.rolling_mean(data['close'], ndays), name = tag) 
 data[tag] = SMA
 return data
# SMA(df,5)

##bbands
def BBANDS(data, ndays):

 MA = pd.Series(pd.rolling_mean(data['close'], ndays)) 
 SD = pd.Series(pd.rolling_std(data['close'], ndays))
 b1 = MA + (2 * SD)
 B1 = pd.Series(b1, name = 'Upper BollingerBand') 
 data['ub'] = B1
 b2 = MA - (2 * SD)
 B2 = pd.Series(b2, name = 'Lower BollingerBand') 
#  data = data.join(B2) 
 data['lb'] = B2
 return data
# BBANDS(df,50)

##cci
def CCI(data, ndays): 
 TP = (data['high'] + data['low'] + data['close']) / 3 
 CCI = pd.Series((TP - pd.rolling_mean(TP, ndays)) / (0.015 * pd.rolling_std(TP, ndays)),name = 'CCI') 
#  data = data.join(CCI) 
 data['CCI'] = CCI   
 return data
# CCI(df,20)

##roc
def ROC(data,n):
 N = data['close'].diff(n)
 D = data['close'].shift(n)
 ROC = pd.Series(N/D,name='Rate of Change')
 data['ROC'] = ROC
#  data = data.join(ROC)
 return data
# ROC(df,5)

# Ease of Movement 
def EVM(data, ndays): 
 dm = ((data['high'] + data['low'])/2) - ((data['high'].shift(1) + data['low'].shift(1))/2)
 br = (data['volume'] / 100000000) / ((data['high'] - data['low']))
 EVM = dm / br 
 EVM_MA = pd.Series(pd.rolling_mean(EVM, ndays), name = 'EVM')
 data['EVM']  =  EVM
#  data = data.join(EVM_MA) 
 return data
# EVM(df,14)

# Force Index 
def ForceIndex(data, ndays): 
 FI = pd.Series(data['close'].diff(ndays) * data['volume'], name = 'ForceIndex') 
 data['FI'] = FI
#  data = data.join(FI) 
 return data
# ForceIndex(df,1)
##macd
def MACD(data,short=0,long1=0,mid=0):
    if short==0:
        short=12
    if long1==0:
        long1=26
    if mid==0:
        mid=9
    data['sema']=pd.ewma(data['close'],span=short)
    data['lema']=pd.ewma(data['close'],span=long1)
    data.fillna(0,inplace=True)
    data['macd_dif']=data['sema']-data['lema']
    data['macd_dea']=pd.ewma(data['macd_dif'],span=mid)
#     data['macd']=2*(data['macd_dif']-data['macd_dea'])
    data.fillna(0,inplace=True)
    return data

# MACD(df,0,0,0)

def SMA_CN(close, timeperiod) :
    close = np.nan_to_num(close)
    return reduce(lambda x, y: ((timeperiod - 1) * x + y) / timeperiod, close)

# 同花顺和通达信等软件中的RSI
def RSI_CN(data, timeperiod) :
    close = np.array(data['close'])
    diff = map(lambda x, y : x - y, close[1:], close[:-1])
    diffGt0 = map(lambda x : 0 if x < 0 else x, diff)
    diffABS = map(lambda x : abs(x), diff)
    diff = np.array(diff)
    diffGt0 = np.array(diffGt0)
    diffABS = np.array(diffABS)
    diff = np.append(diff[0], diff)
    diffGt0 = np.append(diffGt0[0], diffGt0)
    diffABS = np.append(diffABS[0], diffABS)
    rsi = map(lambda x : SMA_CN(diffGt0[:x], timeperiod) / SMA_CN(diffABS[:x], timeperiod) * 100
            , range(1, len(diffGt0) + 1) )
    data['RSI'] = rsi
    return data
#RSI_CN(df,14)

##ATR指标主要是用来衡量市场波动的强烈度
def ATR(data,timeperiod):
    close_ATR = np.array(data['close'])
    high_ATR = np.array(data['high'])
    low_ATR = np.array(data['low'])
    atr = tl.ATR(high_ATR, low_ATR, close_ATR, timeperiod)
    data['ATR'] = atr
    return data
# ATR(df,14)

def OBV(data):
    obv = tl.OBV(np.array(data['close']),np.array(data['volume']))
    data['OBV'] = obv
    return data
# OBV(df)

def MOM(data):
    mom = tl.MOM(np.array(data['close']), timeperiod=5)
    data['MOM'] = mom
    return data
# MOM(df)
    

def get_tech_data(df):
    data = df.copy()
    SMA(data,5,'sma_5')
    SMA(data,10,'sma_10')
    SMA(data,20,'sma_20')
    SMA(data,30,'sma_30')
    SMA(data,60,'sma_60')
    BBANDS(data,50)
    MACD(data,0,0,0)
    RSI_CN(data,6)
    CCI(data,20)
    ROC(data,5)
    EVM(data,14)
    ForceIndex(data,1)
    ATR(data,14)
    OBV(data)
    MOM(data)
#     data.drop(columns=['open', 'high','close','low','volume','money'])
    data = data.drop('open', 1)
    data = data.drop('high', 1)
    data = data.drop('close', 1)
    data = data.drop('low', 1)
    data = data.drop('volume', 1)
    data = data.drop('money', 1)
    data = data.drop('sema', 1)
    data = data.drop('lema', 1)

    return data
df = get_price('000300.XSHG', end_date='2019-02-17', frequency='daily', fields=['open','high','close','low', 'volume','money']) 
tech_data = get_tech_data(df)
tech_data.tail(10)

comm_data = pd.DataFrame(index = df.index)
comm_data.head()

for c in ['open','high','low','volume']:
    for p in [1,2,3]:
       comm_data[c+"diff"+str(p)]=(df[c] - df[c].shift(p)) / df[c].shift(p)
    
comm_data.tail()

##窗口差异
ml_datas = pd.DataFrame(index = df.index)
for w in [5,10,20,30,60]:
    for c in comm_data.columns:
        ml_datas[c+"_win_"+str(w)] = comm_data[c] / (pd.Series(comm_data[c]).rolling(window=w,center=False).max() - comm_data[c].rolling(window=w,center=False).min())
        
# ml_datas.tail(10)   

##构建机器学习数据集
ml_datas = ml_datas.join(tech_data)
##关键一步，将数据左移1天
ml_datas = ml_datas.shift(1)
##明天的收盘价
ml_datas['reg_target'] = df['close']
##明天相比当天的涨跌
ml_datas['clf_target'] = (df['close']/df['close'].shift(1)) - 1 > 0
ml_datas.tail(10)
ml_datas[['sma_10','reg_target','clf_target']].tail(10)

ml_datas = ml_datas.dropna()
ml_datas.describe()


X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis = 1)
X_ori.describe()

##当天的收盘价相比昨天的收盘价上涨还是下跌
y = ml_datas['clf_target']
y.describe()

scaler = preprocessing.StandardScaler().fit(X_ori)
X = scaler.transform(X_ori)
X[:10,:]

##Build a forerest and compute feature importance
forest = ExtraTreesClassifier(n_estimators = 250,random_state = 0)
forest.fit(X,y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],axis= 0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for f in range(len(indices)):
    print("%d. feature %s (%f)" % (f+1,X_ori.columns[indices[f]],importances[indices[f]]))
    
indices = indices[:20]

plt.figure(figsize=(16,9))
plt.title("feature importance")
plt.bar(range(len(indices)),importances[indices],color='r',yerr=std[indices],align='center')
plt.xticks(range(len(indices)),indices)
plt.xlim([-1,len(indices)])
plt.show()

###对沪深300指数进行预测分类
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]
print X_train.shape,y_train.shape,X_test.shape,y_test.shape

##基线
# model = LinearRegression()
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)
#注意结果是bool值
# score = r2_score(y_pred,y_test)
# print 'LinearRegression Score:',confusion_matrix(y_pred, y_test)

models = [("LR", LogisticRegression()), 
              ("LDA", LDA()), 
              ("QDA", QDA()),
              ("LSVC", LinearSVC()),
              ("RSVM", SVC(
              	C=1000000.0, cache_size=200, class_weight=None,
                coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
                max_iter=-1, probability=False, random_state=None,
                shrinking=True, tol=0.001, verbose=False)
              ),
              ("RF", RandomForestClassifier(
              	n_estimators=1000, criterion='gini', 
                max_depth=None, min_samples_split=2, 
                min_samples_leaf=1, max_features='auto', 
                bootstrap=True, oob_score=False, n_jobs=1, 
                random_state=None, verbose=0)
              )]

    # Iterate through the models
for m in models:

    # Train each of the models on the training set
    m[1].fit(X_train, y_train)

    # Make an array of predictions on the test set
    pred = m[1].predict(X_test)

    # Output the hit-rate and the confusion matrix for each model
    print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
    print("%s\n" % confusion_matrix(pred, y_test))
#     print("%s\n" % r2_score(y_test,pred))

    
##网格搜索+交叉验证
# tuned_parameters = [
#     {'n_estimators': [500,1000],'min_samples_split':[5],'min_samples_leaf':[1]}
# ]
# model = GridSearchCV(RandomForestClassifier(),tuned_parameters,cv=10)
# model.fit(X_train, y_train)

# print("Optimised parameters found on training set:")
# print(model.best_estimator_, "\n")

# print("Grid scores calculated on training set:")
# for params, mean_score, scores in model.grid_scores_:
#     print("%0.3f for %r" % (mean_score, params))
    
##回归
X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis=1)
y = ml_datas['reg_target']
start = '2017-01-01'
X_train = X_ori[X_ori.index<start]
X_test = X_ori[X_ori.index>=start]
y_train = y[y.index<start]
y_test = y[y.index>=start]
tuned_parameters = [
    {'alpha':[1,0.5,0.1,0.01,0.001]}
]
##岭回归
model = GridSearchCV(Ridge(),tuned_parameters,cv=10)
model.fit(X_train, y_train)

print("Optimised parameters found on training set:")
print(model.best_estimator_, "\n")

print("Grid scores calculated on training set:")
for params, mean_score, scores in model.grid_scores_:
    print("%0.3f for %r" % (mean_score, params))
    
y_pred = model.predict(X_test)
print("tomorow close is %s,current date is %s" % (y_pred[-1],X_test.index[-1]))
print("R2_sore",r2_score(y_test,y_pred))

df_result = pd.DataFrame(index = y_test.index)

df_result['True value'] = y_test
df_result['Pred value'] = y_pred
df_result.plot(figsize=(16,9))

/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=5,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=30,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=60,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:30: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:31: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).std()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:87: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=12,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:88: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=26,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:91: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=9,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:45: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:45: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).std()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:66: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).mean()

 Feature ranking:
1. feature volumediff3_win_60 (0.015575)
2. feature volumediff3_win_30 (0.015485)
3. feature CCI (0.015274)
4. feature RSI (0.015049)
5. feature volumediff3_win_20 (0.014921)
6. feature volumediff1_win_5 (0.014855)
7. feature opendiff1_win_5 (0.014708)
8. feature ROC (0.014554)
9. feature volumediff2_win_30 (0.014312)
10. feature volumediff3_win_10 (0.014244)
11. feature MOM (0.014226)
12. feature volumediff3_win_5 (0.014218)
13. feature lowdiff1_win_10 (0.014178)
14. feature volumediff2_win_10 (0.014170)
15. feature volumediff2_win_5 (0.014016)
16. feature highdiff1_win_20 (0.013877)
17. feature lowdiff2_win_5 (0.013800)
18. feature volumediff1_win_60 (0.013672)
19. feature opendiff3_win_30 (0.013641)
20. feature volumediff1_win_30 (0.013517)
21. feature highdiff3_win_10 (0.013482)
22. feature volumediff1_win_20 (0.013421)
23. feature lowdiff1_win_20 (0.013404)
24. feature sma_30 (0.013394)
25. feature volumediff2_win_20 (0.013388)
26. feature macd_dea (0.013337)
27. feature volumediff2_win_60 (0.013328)
28. feature lowdiff3_win_5 (0.013268)
29. feature volumediff1_win_10 (0.013261)
30. feature lowdiff1_win_5 (0.013232)
31. feature opendiff2_win_10 (0.013123)
32. feature opendiff1_win_20 (0.013101)
33. feature highdiff1_win_5 (0.013055)
34. feature opendiff1_win_30 (0.013052)
35. feature highdiff3_win_60 (0.013029)
36. feature highdiff2_win_20 (0.013020)
37. feature EVM (0.013013)
38. feature highdiff1_win_60 (0.012998)
39. feature opendiff2_win_5 (0.012990)
40. feature highdiff3_win_20 (0.012986)
41. feature highdiff3_win_5 (0.012982)
42. feature highdiff1_win_30 (0.012960)
43. feature opendiff1_win_60 (0.012959)
44. feature highdiff3_win_30 (0.012924)
45. feature lowdiff2_win_60 (0.012850)
46. feature opendiff3_win_20 (0.012828)
47. feature highdiff1_win_10 (0.012821)
48. feature lowdiff3_win_10 (0.012809)
49. feature highdiff2_win_10 (0.012791)
50. feature opendiff3_win_5 (0.012779)
51. feature ub (0.012749)
52. feature lowdiff3_win_60 (0.012702)
53. feature sma_60 (0.012695)
54. feature lowdiff3_win_20 (0.012667)
55. feature lowdiff3_win_30 (0.012639)
56. feature sma_20 (0.012623)
57. feature sma_5 (0.012620)
58. feature opendiff1_win_10 (0.012589)
59. feature FI (0.012546)
60. feature lb (0.012427)
61. feature opendiff2_win_20 (0.012389)
62. feature highdiff2_win_60 (0.012388)
63. feature lowdiff2_win_20 (0.012380)
64. feature highdiff2_win_5 (0.012362)
65. feature highdiff2_win_30 (0.012330)
66. feature lowdiff2_win_10 (0.012264)
67. feature opendiff3_win_60 (0.012256)
68. feature lowdiff1_win_30 (0.012184)
69. feature macd_dif (0.012112)
70. feature sma_10 (0.012108)
71. feature ATR (0.012101)
72. feature opendiff2_win_30 (0.011868)
73. feature lowdiff2_win_30 (0.011766)
74. feature opendiff2_win_60 (0.011731)
75. feature lowdiff1_win_60 (0.011581)
76. feature opendiff3_win_10 (0.011045)

(425, 76) (425,) (515, 76) (515,)
LR:
0.517
[[122 124]
 [125 144]]

LDA:
0.476
[[159 182]
 [ 88  86]]

QDA:
0.485
[[100 118]
 [147 150]]

LSVC:
0.483
[[125 144]
 [122 124]]

RSVM:
0.520
[[  0   0]
 [247 268]]

RF:
0.534
[[117 110]
 [130 158]]

Optimised parameters found on training set:
(Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001), '\n')
Grid scores calculated on training set:
0.648 for {'alpha': 1}
0.628 for {'alpha': 0.5}
0.571 for {'alpha': 0.1}
0.485 for {'alpha': 0.01}
0.445 for {'alpha': 0.001}
tomorow close is 3367.07625415,current date is 2019-02-15 00:00:00
('R2_sore', 0.96953680773848738)

<matplotlib.axes._subplots.AxesSubplot at 0x7f45346c9210>

量化交易吧 / 数理科学 帖子：3370860 新帖：23

机器学习用于大盘指数预测

外汇老老法师发表于：6 月 17 日 00：35回复(1)

全部回复

0/140

粉丝:734

帖子数:0

粉丝:555

帖子数:0

粉丝:565

帖子数:0

量化课程

热门标签

删除回复

确认要删除这篇文章么？

举报用户

信息提示

该文章已删除

设置置顶

完成设置【置顶】！

设置置顶

已取消设置【置顶】！

设置精华

完成设置【精华】！

设置精华

已取消设置【精华】！

审核信息

该文章已审核通过

审核信息

您已设置该文章审核不通过

举报成功

您已举报成功

用户登录

移动帖子

创建私信

屏蔽提示

确认要屏蔽该用户么？

屏蔽回复

您已对该用户实现屏蔽

信息回复

已发送成功

量化交易吧 / 数理科学帖子：3370860 新帖：23