请 [注册] 或 [登录]  | 返回主站

量化交易吧 /  数理科学 帖子:3129657 新帖:13

基于遗传规划自动挖掘因子

djfjsfj发表于:8 月 31 日 18:59回复(1)

基于遗传规划自动挖掘因子¶

摘要¶

本文基于聚宽提供的开源数据模块jqdatasdk获取标的的行情数据,使用gplearn构建了alpha因子表达式,并采用jqfactor_analyzer对构建的alpha因子进行了单因子分析, 为因子挖掘提供了一条思路。
虽然市面上有流行的alpha191和alpha101,但鲜有文献详细描述构建原理及方法。
本文主要起抛砖引玉的作用,希望了解这部分知识的同学能分享下相关内容,谢谢!

导入所需的模块¶

import numpy as np
import pandas as pd
import graphviz
from scipy.stats import rankdata
import pickle

from gplearn import genetic
from gplearn.functions import make_function
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from gplearn.fitness import make_fitness

from sklearn.utils import check_random_state
from sklearn.model_selection import train_test_split

import jqdatasdk as jq
import jqfactor_analyzer as ja

jq.auth('手机号', '密码')

import warnings
warnings.filterwarnings("ignore")
auth success 

使用jqdatasdk获取及处理数据¶

# 设置起止时间
start_date = '2019-01-01'
end_date = '2019-06-01'
fields = ['open', 'high', 'low', 'avg', 'pre_close', 'high_limit','low_limit', 'close']

stock_price = jq.get_price('000001.XSHE', start_date=start_date, end_date=end_date, fq='post', fields=fields)
stock_price['pct'] = stock_price['close'].pct_change(periods=1)
data = stock_price[fields].values
target = stock_price['pct'].values

test_size=0.2
test_num = int(len(data)*test_size)

X_train = data[:-test_num]
X_test = data[-test_num:]
y_train = np.nan_to_num(target[:-test_num])
y_test = np.nan_to_num(target[-test_num:])

定义函数¶

# 系统自带的函数群

"""
Available individual functions are:

‘add’ : addition, arity=2.
‘sub’ : subtraction, arity=2.
‘mul’ : multiplication, arity=2.
‘div’ : protected division where a denominator near-zero returns 1., arity=2.
‘sqrt’ : protected square root where the absolute value of the argument is used, arity=1.
‘log’ : protected log where the absolute value of the argument is used and a near-zero argument returns 0., arity=1.
‘abs’ : absolute value, arity=1.
‘neg’ : negative, arity=1.
‘inv’ : protected inverse where a near-zero argument returns 0., arity=1. 
‘max’ : maximum, arity=2.
‘min’ : minimum, arity=2.
‘sin’ : sine (radians), arity=1.
‘cos’ : cosine (radians), arity=1.
‘tan’ : tangent (radians), arity=1.
"""

# init_function = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan']
init_function = ['add', 'sub', 'mul', 'div']
# 自定义函数, make_function函数群

def _rolling_rank(data):
    value = rankdata(data)[-1]
    
    return value

def _rolling_prod(data):
    
    return np.prod(data)

def _ts_sum(data):
    window=10
    value = np.array(pd.Series(data.flatten()).rolling(window).sum().tolist())
    value = np.nan_to_num(value)

    return value

def _sma(data):
    window=10
    value = np.array(pd.Series(data.flatten()).rolling(window).mean().tolist())
    value = np.nan_to_num(value)
    
    return value

def _stddev(data):
    window=10
    value = np.array(pd.Series(data.flatten()).rolling(window).std().tolist())
    value = np.nan_to_num(value)
    
    return value

def _ts_rank(data):
    window=10
    value = np.array(pd.Series(data.flatten()).rolling(10).apply(_rolling_rank).tolist())
    value = np.nan_to_num(value)
    
    return value

def _product(data):
    window=10
    value = np.array(pd.Series(data.flatten()).rolling(10).apply(_rolling_prod).tolist())
    value = np.nan_to_num(value)
    
    return value

def _ts_min(data):
    window=10
    value = np.array(pd.Series(data.flatten()).rolling(window).min().tolist())
    value = np.nan_to_num(value)
    
    return value

def _ts_max(data):
    window=10
    value = np.array(pd.Series(data.flatten()).rolling(window).max().tolist())
    value = np.nan_to_num(value)
    
    return value

def _delta(data):
    value = np.diff(data.flatten())
    value = np.append(0, value)

    return value

def _delay(data):
    period=1
    value = pd.Series(data.flatten()).shift(1)
    value = np.nan_to_num(value)
    
    return value

def _rank(data):
    value = np.array(pd.Series(data.flatten()).rank().tolist())
    value = np.nan_to_num(value)
    
    return value

def _scale(data):
    k=1
    data = pd.Series(data.flatten())
    value = data.mul(1).div(np.abs(data).sum())
    value = np.nan_to_num(value)
    
    return value

def _ts_argmax(data):
    window=10
    value = pd.Series(data.flatten()).rolling(10).apply(np.argmax) + 1 
    value = np.nan_to_num(value)
    
    return value

def _ts_argmin(data):
    window=10
    value = pd.Series(data.flatten()).rolling(10).apply(np.argmin) + 1 
    value = np.nan_to_num(value)
    
    return value

# make_function函数群
delta = make_function(function=_delta, name='delta', arity=1)
delay = make_function(function=_delay, name='delay', arity=1)
rank = make_function(function=_rank, name='rank', arity=1)
scale = make_function(function=_scale, name='scale', arity=1)
sma = make_function(function=_sma, name='sma', arity=1)
stddev = make_function(function=_stddev, name='stddev', arity=1)
product = make_function(function=_product, name='product', arity=1)
ts_rank = make_function(function=_ts_rank, name='ts_rank', arity=1)
ts_min = make_function(function=_ts_min, name='ts_min', arity=1)
ts_max = make_function(function=_ts_max, name='ts_max', arity=1)
ts_argmax = make_function(function=_ts_argmax, name='ts_argmax', arity=1)
ts_argmin = make_function(function=_ts_argmin, name='ts_argmin', arity=1)
ts_sum = make_function(function=_ts_sum, name='ts_sum', arity=1)

user_function = [delta, delay, rank, scale, sma, stddev, product, ts_rank, ts_min, ts_max, ts_argmax, ts_argmin, ts_sum]

定义适应度¶

def _my_metric(y, y_pred, w):
    value = np.sum(np.abs(y) + np.abs(y_pred))

    return value

my_metric = make_fitness(function=_my_metric, greater_is_better=True)

生成表达式¶

generations = 5
function_set = init_function + user_function
metric = my_metric
population_size = 100
random_state=0

est_gp = SymbolicTransformer(
                            feature_names=fields, 
                            function_set=function_set,
                            generations=generations,
                            metric=metric,
                            population_size=population_size,
                            tournament_size=20, 
                            random_state=random_state,
                         )

est_gp.fit(X_train, y_train)

# 将模型保存到本地
with open('gp_model.pkl', 'wb') as f:
    pickle.dump(est_gp, f)
# 获取较优的表达式

best_programs = est_gp._best_programs
best_programs_dict = {}

for p in best_programs:
    factor_name = 'alpha_' + str(best_programs.index(p) + 1)
    best_programs_dict[factor_name] = {'fitness':p.fitness_, 'expression':str(p), 'depth':p.depth_, 'length':p.length_}
     
best_programs_dict = pd.DataFrame(best_programs_dict).T
best_programs_dict = best_programs_dict.sort_values(by='fitness')
best_programs_dict 
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
depth expression fitness length
alpha_10 5 delta(div(stddev(stddev(ts_max(avg))), stddev(... 325.906 10
alpha_9 6 sub(ts_rank(delay(mul(delta(sma(open)), div(ts... 413.386 18
alpha_8 1 delta(pre_close) 1920.03 2
alpha_7 3 delta(delay(delay(open))) 2802.3 4
alpha_5 3 rank(ts_rank(div(high, pre_close))) 3241.39 5
alpha_6 3 rank(ts_rank(ts_argmin(high_limit))) 3241.4 4
alpha_4 6 mul(delay(mul(rank(ts_argmin(delta(low))), ts_... 19240.9 20
alpha_3 4 product(ts_argmin(product(ts_rank(pre_close)))) 7.74627e+10 5
alpha_2 2 product(delta(close)) 1.31043e+15 3
alpha_1 5 mul(product(sub(open, ts_sum(close))), ts_max(... 2.53308e+50 14
def alpha_factor_graph(num):
    # 打印指定num的表达式图

    factor = best_programs[num-1]
    print(factor)
    print('fitness: {0}, depth: {1}, length: {2}'.format(factor.fitness_, factor.depth_, factor.length_))

    dot_data = factor.export_graphviz()
    graph = graphviz.Source(dot_data)
    graph.render('images/alpha_factor_graph', format='png', cleanup=True)
    
    return graph

使用表达式构建因子(以alpha_10为例)¶

# 打印因子alpha_10的结构图

graph10 = alpha_factor_graph(10)
graph10
delta(div(stddev(stddev(ts_max(avg))), stddev(ts_rank(ts_argmin(close)))))
fitness: 325.9056451860095, depth: 5, length: 10
program 0 delta 1 div 0->1 2 stddev 1->2 6 stddev 1->6 3 stddev 2->3 4 ts_max 3->4 5 avg 4->5 7 ts_rank 6->7 8 ts_argmin 7->8 9 close 8->9
# alpha_10 delta(div(stddev(stddev(ts_max(avg))), stddev(ts_rank(ts_argmin(close)))))

def alpha_10(df):
    value1 = _stddev(_stddev(_ts_max(np.array(df['avg'].tolist()))))
    value2 = _stddev(_ts_rank(_ts_argmin(np.array(df['close'].tolist()))))
    value = delta(value1/value2)

    return value
    
stock_price['alpha_10'] = alpha_10(stock_price)

单因子分析构建的因子¶

# 设置起止时间
start_date = '2019-01-01'
end_date = '2019-06-01'

# 设置调仓周期
periods=(1, 5, 20)
# 设置分层数量
quantiles=5

# 设置股票池
scu='000300.XSHG'
securities = jq.get_index_stocks(scu)
# 获取需要分析的数据

factor_data = pd.DataFrame(columns=securities, index=stock_price.index)
for security in securities:
    stock_price = jq.get_price(security, start_date=start_date, end_date=end_date, fq='post', fields=fields)
    stock_price['pct'] = stock_price['close'].pct_change(periods=1)
    stock_price['alpha_10'] = alpha_10(stock_price)
    
    factor_data[security] = stock_price['alpha_10']
# 使用获取的因子值进行单因子分析
far = ja.analyze_factor(factor=factor_data, 
                        weight_method='avg', 
                        industry='jq_l1', 
                        quantiles=quantiles, 
                        periods=periods,
                        max_loss=0.25)
# 生成统计图表
far.create_full_tear_sheet(
    demeaned=False, group_adjust=False, by_group=False,
    turnover_periods=None, avgretplot=(5, 15), std_bar=False
)
分位数统计
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
min max mean std count count %
factor_quantile
1.0 -209.188979 0.337907 -1.892000 8.576601 4140 20.003865
2.0 -8.521210 0.845769 -0.256171 0.989321 4139 19.999034
3.0 -3.770525 2.599885 -0.023692 0.507251 4138 19.994202
4.0 -1.505961 8.112976 0.201420 0.787316 4139 19.999034
5.0 -0.439981 196.023430 1.565537 6.509158 4140 20.003865
-------------------------

收益分析
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
period_1 period_5 period_20
Ann. alpha 0.429 0.658 0.082
beta -0.040 0.000 0.136
Mean Period Wise Return Top Quantile (bps) 29.506 26.815 23.671
Mean Period Wise Return Bottom Quantile (bps) 24.146 22.783 23.058
Mean Period Wise Spread (bps) 5.360 3.856 0.444
<matplotlib.figure.Figure at 0x28bb44dc860>
<matplotlib.figure.Figure at 0x28bb45879e8>
<matplotlib.figure.Figure at 0x28bb6875a90>
<matplotlib.figure.Figure at 0x28bb6df39e8>
-------------------------

IC 分析
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
period_1 period_5 period_20
IC Mean -0.003 0.025 0.012
IC Std. 0.114 0.127 0.098
IR -0.024 0.194 0.118
t-stat(IC) -0.200 1.613 0.980
p-value(IC) 0.842 0.111 0.331
IC Skew 0.627 0.127 0.220
IC Kurtosis -0.248 -0.725 -0.707
<matplotlib.figure.Figure at 0x28bb6d4be80>