繁簡切換

您正在訪問的是FX168財經網,本網站所提供的內容及信息均遵守中華人民共和國香港特別行政區當地法律法規。

FX168财经网>人物频道>帖子

机器学习用于大盘指数预测

作者/外汇老老法师 2019-06-17 00:35 0 来源: FX168财经网人物频道

运用多个因子综合进行机器学习预测

import pandas as pd
import talib as tl
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.svm import LinearSVC, SVC
from sklearn.grid_search import GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score


##sma
def SMA(data, ndays, tag):
    """Append a simple moving average of the ``close`` column to *data*.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        ndays: Rolling window length, in rows.
        tag: Name of the column that receives the moving average.

    Returns:
        The same ``data`` object, with column ``tag`` added.
    """
    # Series.rolling replaces the deprecated top-level pd.rolling_mean.
    # The first ndays - 1 rows are NaN because the window is not yet full.
    data[tag] = data['close'].rolling(window=ndays, center=False).mean()
    return data
# SMA(df, 5, 'sma_5')

##bbands
def BBANDS(data, ndays):
    """Append upper and lower Bollinger Bands of ``close`` to *data*.

    The bands are the rolling mean plus/minus two rolling standard
    deviations over ``ndays`` rows.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        ndays: Rolling window length, in rows.

    Returns:
        The same ``data`` object, with columns ``ub`` (upper band) and
        ``lb`` (lower band) added.
    """
    # One rolling window object serves both statistics; this replaces the
    # deprecated pd.rolling_mean / pd.rolling_std helpers.
    window = data['close'].rolling(window=ndays, center=False)
    middle = window.mean()
    spread = window.std()
    data['ub'] = middle + (2 * spread)
    data['lb'] = middle - (2 * spread)
    return data
# BBANDS(df,50)

##cci
def CCI(data, ndays):
    """Append a Commodity-Channel-Index style oscillator to *data*.

    The typical price is ``(high + low + close) / 3``. The indicator is
    the deviation of the typical price from its rolling mean, divided by
    0.015 times its rolling standard deviation.

    Args:
        data: DataFrame with ``high``, ``low`` and ``close`` columns; it is
            modified in place.
        ndays: Rolling window length, in rows.

    Returns:
        The same ``data`` object, with column ``CCI`` added.
    """
    typical_price = (data['high'] + data['low'] + data['close']) / 3
    # Series.rolling replaces the deprecated pd.rolling_mean / pd.rolling_std.
    window = typical_price.rolling(window=ndays, center=False)
    # NOTE(review): the denominator uses the rolling standard deviation, as
    # the original code did; confirm this is the intended CCI variant.
    data['CCI'] = (typical_price - window.mean()) / (0.015 * window.std())
    return data
# CCI(df,20)

##roc
def ROC(data, n):
    """Append the n-row rate of change of ``close`` to *data*.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        n: Number of rows to look back.

    Returns:
        The same ``data`` object, with column ``ROC`` added. The value is
        ``(close - close n rows earlier) / close n rows earlier``.
    """
    closes = data['close']
    earlier = closes.shift(n)
    change = closes.diff(n)
    data['ROC'] = change / earlier
    return data
# ROC(df,5)

# Ease of Movement 
def EVM(data, ndays):
    """Append the smoothed Ease of Movement indicator to *data*.

    Args:
        data: DataFrame with ``high``, ``low`` and ``volume`` columns; it is
            modified in place.
        ndays: Rolling window length, in rows, used to smooth the raw value.

    Returns:
        The same ``data`` object, with column ``EVM`` added.
    """
    midpoint = (data['high'] + data['low']) / 2
    # Distance the bar's midpoint moved since the previous row.
    distance_moved = midpoint - midpoint.shift(1)
    # Volume (scaled down by 1e8) per unit of high-low range.
    box_ratio = (data['volume'] / 100000000) / (data['high'] - data['low'])
    raw_evm = distance_moved / box_ratio
    # Store the ndays moving average. The original computed this average
    # but then stored the unsmoothed series, so ndays had no effect.
    data['EVM'] = raw_evm.rolling(window=ndays, center=False).mean()
    return data
# EVM(df,14)

# Force Index 
def ForceIndex(data, ndays):
    """Append the Force Index to *data*.

    Args:
        data: DataFrame with ``close`` and ``volume`` columns; it is modified
            in place.
        ndays: Number of rows over which the close-price change is taken.

    Returns:
        The same ``data`` object, with column ``FI`` added. The value is the
        ``ndays``-row change in close multiplied by the row's volume.
    """
    price_change = data['close'].diff(ndays)
    data['FI'] = price_change * data['volume']
    return data
# ForceIndex(df,1)
##macd
def MACD(data, short=0, long1=0, mid=0):
    """Append MACD components of the ``close`` column to *data*.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        short: Span of the fast EMA; ``0`` selects the default of 12.
        long1: Span of the slow EMA; ``0`` selects the default of 26.
        mid: Span of the signal EMA; ``0`` selects the default of 9.

    Returns:
        The same ``data`` object, with columns ``sema`` (fast EMA), ``lema``
        (slow EMA), ``macd_dif`` (fast minus slow) and ``macd_dea`` (EMA of
        ``macd_dif``) added.
    """
    if short == 0:
        short = 12
    if long1 == 0:
        long1 = 26
    if mid == 0:
        mid = 9

    def _ema(series, span):
        # Same parameters the pd.ewma deprecation warning names as the
        # equivalent Series.ewm call.
        return series.ewm(
            span=span, min_periods=0, adjust=True, ignore_na=False
        ).mean()

    close = data['close']
    # Zero-fill only the columns created here. The original called
    # data.fillna(0, inplace=True), which also overwrote NaNs in every other
    # column of the frame (for example moving-average warm-up rows).
    data['sema'] = _ema(close, short).fillna(0)
    data['lema'] = _ema(close, long1).fillna(0)
    data['macd_dif'] = data['sema'] - data['lema']
    data['macd_dea'] = _ema(data['macd_dif'], mid).fillna(0)
    return data

# MACD(df,0,0,0)

def SMA_CN(close, timeperiod):
    """Return the final value of a recursively smoothed average of *close*.

    Starting from the first element, each further element ``y`` updates the
    running value ``x`` to ``((timeperiod - 1) * x + y) / timeperiod``.
    NaN entries are replaced by 0 (via ``np.nan_to_num``) before smoothing.

    Args:
        close: Non-empty sequence of numbers.
        timeperiod: Smoothing period.

    Returns:
        The smoothed value after the last element.

    Raises:
        TypeError: If ``close`` is empty (``reduce`` has no initial value).
    """
    # reduce is a builtin only on Python 2; functools.reduce exists on both.
    from functools import reduce

    # Convert to float so integer input cannot trigger Python 2 floor division.
    values = np.nan_to_num(np.asarray(close, dtype=float))
    return reduce(
        lambda prev, cur: ((timeperiod - 1) * prev + cur) / timeperiod,
        values,
    )

# RSI as computed by Chinese charting software such as Tonghuashun and
# Tongdaxin.
def RSI_CN(data, timeperiod):
    """Append an RSI column to *data* using recursive smoothing.

    The one-row changes in ``close`` are taken, and the first change is
    duplicated so the result has one value per row. Both the gains (changes
    with negatives clipped to 0) and the absolute changes are smoothed with
    ``x = ((timeperiod - 1) * x + y) / timeperiod``, seeded with their first
    element. RSI is ``smoothed gains / smoothed absolute changes * 100``.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        timeperiod: Smoothing period.

    Returns:
        The same ``data`` object, with column ``RSI`` added. Rows where the
        smoothed absolute change is 0 get NaN.
    """
    close = np.asarray(data['close'], dtype=float)
    count = len(close)
    if count < 2:
        # No price change can be formed; the original raised IndexError here.
        data['RSI'] = np.full(count, np.nan)
        return data

    diff = np.diff(close)
    # Duplicate the first change so there is one value per input row.
    diff = np.append(diff[0], diff)

    rsi = np.empty(count)
    with np.errstate(divide='ignore', invalid='ignore'):
        gains = np.nan_to_num(np.where(diff < 0, 0.0, diff))
        moves = np.nan_to_num(np.abs(diff))
        # Carry the running averages forward instead of re-smoothing every
        # prefix from scratch; same recurrence, O(n) instead of O(n^2).
        avg_gain = gains[0]
        avg_move = moves[0]
        for i in range(count):
            if i:
                avg_gain = ((timeperiod - 1) * avg_gain + gains[i]) / timeperiod
                avg_move = ((timeperiod - 1) * avg_move + moves[i]) / timeperiod
            rsi[i] = avg_gain / avg_move * 100
    data['RSI'] = rsi
    return data
#RSI_CN(df,14)

## The ATR indicator is mainly used to gauge how strongly the market fluctuates
def ATR(data, timeperiod):
    """Append TA-Lib's Average True Range to *data*.

    Args:
        data: DataFrame with ``high``, ``low`` and ``close`` columns; it is
            modified in place.
        timeperiod: Period passed through to ``tl.ATR``.

    Returns:
        The same ``data`` object, with column ``ATR`` added.
    """
    highs, lows, closes = (
        np.array(data[column]) for column in ('high', 'low', 'close')
    )
    data['ATR'] = tl.ATR(highs, lows, closes, timeperiod)
    return data
# ATR(df,14)

def OBV(data):
    """Append TA-Lib's On-Balance Volume to *data*.

    Args:
        data: DataFrame with ``close`` and ``volume`` columns; it is modified
            in place.

    Returns:
        The same ``data`` object, with column ``OBV`` added.
    """
    closes = np.array(data['close'])
    volumes = np.array(data['volume'])
    data['OBV'] = tl.OBV(closes, volumes)
    return data
# OBV(df)

def MOM(data, timeperiod=5):
    """Append TA-Lib's Momentum indicator of ``close`` to *data*.

    Args:
        data: DataFrame with a ``close`` column; it is modified in place.
        timeperiod: Look-back period passed to ``tl.MOM``. It defaults to 5,
            the value that was previously hard-coded.

    Returns:
        The same ``data`` object, with column ``MOM`` added.
    """
    data['MOM'] = tl.MOM(np.array(data['close']), timeperiod=timeperiod)
    return data
# MOM(df)
    

def get_tech_data(df):
    """Build a frame of technical-indicator features from price data.

    Works on a copy of *df*, so the caller's frame is not modified. Each
    indicator helper adds its column(s) to the copy; the raw price/volume
    columns and the intermediate MACD EMAs are dropped at the end.

    Args:
        df: DataFrame with ``open``, ``high``, ``close``, ``low``,
            ``volume`` and ``money`` columns.

    Returns:
        A new DataFrame with the same index, holding only indicator columns.

    Raises:
        KeyError: If any of the columns to be dropped is missing.
    """
    data = df.copy()

    # Simple moving averages over several horizons.
    for window in (5, 10, 20, 30, 60):
        SMA(data, window, 'sma_%d' % window)

    BBANDS(data, 50)
    MACD(data, 0, 0, 0)  # zeros select the 12/26/9 defaults
    RSI_CN(data, 6)
    CCI(data, 20)
    ROC(data, 5)
    EVM(data, 14)
    ForceIndex(data, 1)
    ATR(data, 14)
    OBV(data)
    MOM(data)

    # One drop with an explicit axis keyword; positional ``axis`` is not
    # accepted by newer pandas releases.
    raw_columns = ['open', 'high', 'close', 'low', 'volume', 'money',
                   'sema', 'lema']
    return data.drop(raw_columns, axis=1)
# Load daily bars for '000300.XSHG' up to 2019-02-17.
# NOTE(review): get_price is neither defined nor imported in this file;
# presumably it is supplied by the hosting research platform. Confirm.
df = get_price('000300.XSHG', end_date='2019-02-17', frequency='daily', fields=['open','high','close','low', 'volume','money']) 
tech_data = get_tech_data(df)
tech_data.tail(10)

# Relative change of each raw field versus 1, 2 and 3 rows earlier.
comm_data = pd.DataFrame(index = df.index)
comm_data.head()

for c in ['open','high','low','volume']:
    for p in [1,2,3]:
       comm_data[c+"diff"+str(p)]=(df[c] - df[c].shift(p)) / df[c].shift(p)
    
comm_data.tail()

## Window features: each relative change divided by its rolling (max - min)
## range over windows of 5, 10, 20, 30 and 60 rows
ml_datas = pd.DataFrame(index = df.index)
for w in [5,10,20,30,60]:
    for c in comm_data.columns:
        ml_datas[c+"_win_"+str(w)] = comm_data[c] / (pd.Series(comm_data[c]).rolling(window=w,center=False).max() - comm_data[c].rolling(window=w,center=False).min())
        
# ml_datas.tail(10)   

## Build the machine-learning data set
ml_datas = ml_datas.join(tech_data)
## Key step: shift every feature down by one row, so each date's features
## come from the previous row
ml_datas = ml_datas.shift(1)
## Regression target: the close of the row's own date, i.e. the "next day"
## relative to the shifted features
ml_datas['reg_target'] = df['close']
## Classification target: True when the row's close is above the previous
## row's close
ml_datas['clf_target'] = (df['close']/df['close'].shift(1)) - 1 > 0
ml_datas.tail(10)
ml_datas[['sma_10','reg_target','clf_target']].tail(10)

# Discard rows with any NaN (e.g. rolling-window warm-up rows).
ml_datas = ml_datas.dropna()
ml_datas.describe()


# Feature matrix: everything except the two targets and the OBV column.
X_ori = ml_datas.drop(['reg_target','clf_target','OBV'],axis = 1)
X_ori.describe()

## Label: whether the day's close rose or fell versus the previous close
y = ml_datas['clf_target']
y.describe()

# Standardise the features. The scaler is fitted on the full X_ori, and the
# scaled X is what the feature-importance forest further down is fitted on;
# the later train/test splits are taken from the unscaled X_ori.
scaler = preprocessing.StandardScaler().fit(X_ori)
X = scaler.transform(X_ori)
X[:10,:]

## Fit an extra-trees forest and rank the features by importance
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
# Per-feature spread of importance across the individual trees (error bars).
std = np.std(
    [tree.feature_importances_ for tree in forest.estimators_],
    axis=0,
)
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for rank, column in enumerate(indices, 1):
    print("%d. feature %s (%f)" % (rank, X_ori.columns[column], importances[column]))

# Plot only the 20 most important features.
indices = indices[:20]
bar_positions = range(len(indices))

plt.figure(figsize=(16, 9))
plt.title("feature importance")
plt.bar(
    bar_positions,
    importances[indices],
    color='r',
    yerr=std[indices],
    align='center',
)
plt.xticks(bar_positions, indices)
plt.xlim([-1, len(indices)])
plt.show()

### Classify the direction of the CSI 300 index
# Chronological split: rows before `start` train the models, the rest test them.
start = '2017-01-01'
X_train = X_ori[X_ori.index < start]
X_test = X_ori[X_ori.index >= start]
y_train = y[y.index < start]
y_test = y[y.index >= start]
# %-formatting prints the same text under Python 2 and Python 3; the original
# print statement was Python 2 only.
print("%s %s %s %s" % (X_train.shape, y_train.shape, X_test.shape, y_test.shape))

## Baseline
# model = LinearRegression()
# model.fit(X_train,y_train)
# y_pred = model.predict(X_test)
# note: the result is a bool value
# score = r2_score(y_pred,y_test)
# print 'LinearRegression Score:',confusion_matrix(y_pred, y_test)

# NOTE(review): the models are fitted on the unscaled X_ori splits; the
# standardised X computed earlier is not used here. Confirm this is intended,
# especially for the SVM-based models.
models = [
    ("LR", LogisticRegression()),
    ("LDA", LDA()),
    ("QDA", QDA()),
    ("LSVC", LinearSVC()),
    ("RSVM", SVC(
        C=1000000.0, cache_size=200, class_weight=None,
        coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
        max_iter=-1, probability=False, random_state=None,
        shrinking=True, tol=0.001, verbose=False)
    ),
    ("RF", RandomForestClassifier(
        n_estimators=1000, criterion='gini',
        max_depth=None, min_samples_split=2,
        min_samples_leaf=1, max_features='auto',
        bootstrap=True, oob_score=False, n_jobs=1,
        random_state=None, verbose=0)
    ),
]

# Iterate through the models
for name, clf in models:

    # Train each of the models on the training set
    clf.fit(X_train, y_train)

    # Make an array of predictions on the test set
    pred = clf.predict(X_test)

    # Output the hit-rate and the confusion matrix for each model.
    # confusion_matrix takes (y_true, y_pred): rows are the true classes and
    # columns the predicted ones. The original passed them the other way
    # round, which transposed the matrix.
    print("%s:\n%0.3f" % (name, clf.score(X_test, y_test)))
    print("%s\n" % confusion_matrix(y_test, pred))
#     print("%s\n" % r2_score(y_test,pred))

    
## Grid search + cross-validation (disabled)
# tuned_parameters = [
#     {'n_estimators': [500,1000],'min_samples_split':[5],'min_samples_leaf':[1]}
# ]
# model = GridSearchCV(RandomForestClassifier(),tuned_parameters,cv=10)
# model.fit(X_train, y_train)

# print("Optimised parameters found on training set:")
# print(model.best_estimator_, "\n")

# print("Grid scores calculated on training set:")
# for params, mean_score, scores in model.grid_scores_:
#     print("%0.3f for %r" % (mean_score, params))
    
## Regression: predict the close price
X_ori = ml_datas.drop(['reg_target', 'clf_target', 'OBV'], axis=1)
y = ml_datas['reg_target']
# Same chronological split as the classification step.
start = '2017-01-01'
X_train = X_ori[X_ori.index < start]
X_test = X_ori[X_ori.index >= start]
y_train = y[y.index < start]
y_test = y[y.index >= start]
tuned_parameters = [
    {'alpha': [1, 0.5, 0.1, 0.01, 0.001]}
]
## Ridge regression, alpha chosen by 10-fold cross-validated grid search
model = GridSearchCV(Ridge(), tuned_parameters, cv=10)
model.fit(X_train, y_train)

# Each message is built with %-formatting so the output is identical on
# Python 2 and 3; print(a, b) prints a tuple repr on Python 2.
print("Optimised parameters found on training set:")
print("%s\n" % (model.best_estimator_,))

print("Grid scores calculated on training set:")
# NOTE(review): grid_scores_ is provided by the GridSearchCV that this file
# imports from sklearn.grid_search; confirm it exists in the installed version.
for params, mean_score, scores in model.grid_scores_:
    print("%0.3f for %r" % (mean_score, params))

y_pred = model.predict(X_test)
# The last prediction belongs to the last test row's own date: that row's
# features are the previous row's values and its target is that date's close.
# The original message labelled it as tomorrow's close.
print("predicted close for %s is %s" % (X_test.index[-1], y_pred[-1]))
print("R2 score: %s" % r2_score(y_test, y_pred))

# Plot true and predicted closes over the test period.
df_result = pd.DataFrame(index=y_test.index)

df_result['True value'] = y_test
df_result['Pred value'] = y_pred
df_result.plot(figsize=(16, 9))
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=5,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=10,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=30,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=60,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:30: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:31: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=50,center=False).std()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:87: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=12,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:88: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=26,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:91: FutureWarning: pd.ewm_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.ewm(ignore_na=False,span=9,min_periods=0,adjust=True).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:45: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).mean()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:45: FutureWarning: pd.rolling_std is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=20,center=False).std()
/opt/conda/envs/python2new/lib/python2.7/site-packages/ipykernel_launcher.py:66: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=14,center=False).mean()
 Feature ranking:
1. feature volumediff3_win_60 (0.015575)
2. feature volumediff3_win_30 (0.015485)
3. feature CCI (0.015274)
4. feature RSI (0.015049)
5. feature volumediff3_win_20 (0.014921)
6. feature volumediff1_win_5 (0.014855)
7. feature opendiff1_win_5 (0.014708)
8. feature ROC (0.014554)
9. feature volumediff2_win_30 (0.014312)
10. feature volumediff3_win_10 (0.014244)
11. feature MOM (0.014226)
12. feature volumediff3_win_5 (0.014218)
13. feature lowdiff1_win_10 (0.014178)
14. feature volumediff2_win_10 (0.014170)
15. feature volumediff2_win_5 (0.014016)
16. feature highdiff1_win_20 (0.013877)
17. feature lowdiff2_win_5 (0.013800)
18. feature volumediff1_win_60 (0.013672)
19. feature opendiff3_win_30 (0.013641)
20. feature volumediff1_win_30 (0.013517)
21. feature highdiff3_win_10 (0.013482)
22. feature volumediff1_win_20 (0.013421)
23. feature lowdiff1_win_20 (0.013404)
24. feature sma_30 (0.013394)
25. feature volumediff2_win_20 (0.013388)
26. feature macd_dea (0.013337)
27. feature volumediff2_win_60 (0.013328)
28. feature lowdiff3_win_5 (0.013268)
29. feature volumediff1_win_10 (0.013261)
30. feature lowdiff1_win_5 (0.013232)
31. feature opendiff2_win_10 (0.013123)
32. feature opendiff1_win_20 (0.013101)
33. feature highdiff1_win_5 (0.013055)
34. feature opendiff1_win_30 (0.013052)
35. feature highdiff3_win_60 (0.013029)
36. feature highdiff2_win_20 (0.013020)
37. feature EVM (0.013013)
38. feature highdiff1_win_60 (0.012998)
39. feature opendiff2_win_5 (0.012990)
40. feature highdiff3_win_20 (0.012986)
41. feature highdiff3_win_5 (0.012982)
42. feature highdiff1_win_30 (0.012960)
43. feature opendiff1_win_60 (0.012959)
44. feature highdiff3_win_30 (0.012924)
45. feature lowdiff2_win_60 (0.012850)
46. feature opendiff3_win_20 (0.012828)
47. feature highdiff1_win_10 (0.012821)
48. feature lowdiff3_win_10 (0.012809)
49. feature highdiff2_win_10 (0.012791)
50. feature opendiff3_win_5 (0.012779)
51. feature ub (0.012749)
52. feature lowdiff3_win_60 (0.012702)
53. feature sma_60 (0.012695)
54. feature lowdiff3_win_20 (0.012667)
55. feature lowdiff3_win_30 (0.012639)
56. feature sma_20 (0.012623)
57. feature sma_5 (0.012620)
58. feature opendiff1_win_10 (0.012589)
59. feature FI (0.012546)
60. feature lb (0.012427)
61. feature opendiff2_win_20 (0.012389)
62. feature highdiff2_win_60 (0.012388)
63. feature lowdiff2_win_20 (0.012380)
64. feature highdiff2_win_5 (0.012362)
65. feature highdiff2_win_30 (0.012330)
66. feature lowdiff2_win_10 (0.012264)
67. feature opendiff3_win_60 (0.012256)
68. feature lowdiff1_win_30 (0.012184)
69. feature macd_dif (0.012112)
70. feature sma_10 (0.012108)
71. feature ATR (0.012101)
72. feature opendiff2_win_30 (0.011868)
73. feature lowdiff2_win_30 (0.011766)
74. feature opendiff2_win_60 (0.011731)
75. feature lowdiff1_win_60 (0.011581)
76. feature opendiff3_win_10 (0.011045)
(425, 76) (425,) (515, 76) (515,)
LR:
0.517
[[122 124]
 [125 144]]

LDA:
0.476
[[159 182]
 [ 88  86]]

QDA:
0.485
[[100 118]
 [147 150]]

LSVC:
0.483
[[125 144]
 [122 124]]

RSVM:
0.520
[[  0   0]
 [247 268]]

RF:
0.534
[[117 110]
 [130 158]]

Optimised parameters found on training set:
(Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001), '\n')
Grid scores calculated on training set:
0.648 for {'alpha': 1}
0.628 for {'alpha': 0.5}
0.571 for {'alpha': 0.1}
0.485 for {'alpha': 0.01}
0.445 for {'alpha': 0.001}
tomorow close is 3367.07625415,current date is 2019-02-15 00:00:00
('R2_sore', 0.96953680773848738)
<matplotlib.axes._subplots.AxesSubplot at 0x7f45346c9210>
 
 
 
 
分享到:
举报财经168客户端下载

全部回复

0/140

投稿 您想发表你的观点和看法?

更多人气分析师

  • 张亦巧

    人气2152文章4145粉丝45

    暂无个人简介信息

  • 梁孟梵

    人气2152文章3177粉丝39

    qq:2294906466 了解群指导添加微信mfmacd

  • 指导老师

    人气1856文章4423粉丝52

    暂无个人简介信息

  • 李冉晴

    人气2296文章3821粉丝34

    李冉晴,专业现货实盘分析师。

  • 刘钥钥1

    人气2016文章3119粉丝34

    专业从事现货黄金、现货白银模拟实盘操作分析指导

  • 张迎妤

    人气1896文章3305粉丝34

    个人专注于行情技术分析,消息面解读剖析,给予您第一时间方向...

  • 金泰铬J

    人气2320文章3925粉丝51

    投资问答解咨询金泰铬V/信tgtg67即可获取每日的实时资讯、行情...

  • 金算盘

    人气2696文章7761粉丝125

    高级分析师,混过名校,厮杀于股市和期货、证券市场多年,专注...

  • 金帝财神

    人气4736文章8329粉丝118

    本文由资深分析师金帝财神微信:934295330,指导黄金,白银,...