看了一下文章多因子模型(三)-交易回测(策略收益90%,回撤12%)
就回测了一下,发现这里的收益更多来源于未来函数:选股时使用了由当期收益计算得到的当期IC。
改用上一期IC计算与使用当期IC计算,结果有着巨大的区别。
@颖da
因子读取¶
#第一步-因子生成
import time
import jqdata
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from jqfactor import Factor,calc_factors
import pandas as pd
from pandas import Panel,DataFrame,Series
import statsmodels.api as sm
import scipy.stats as st
from jqfactor import get_factor_values
from jqfactor import winsorize,winsorize_med,neutralize,standardlize
import pickle
import xlrd  # 156 factor names are loaded from an uploaded Excel sheet instead of being typed by hand
# Read the factor-name table: first sheet, one factor category per column.
ExcelFile=xlrd.open_workbook('FactorTable.xlsx')
name=ExcelFile.sheet_names()
sheet=ExcelFile.sheet_by_name(name[0])
# The slices trim trailing blank cells; lengths match the sheet layout.
factor_quality=list(sheet.col_values(1))
factor_fundamental=list(sheet.col_values(2))[:28]
factor_mood=list(sheet.col_values(3))[:35]
factor_growth=list(sheet.col_values(4))[:8]
factor_risk=list(sheet.col_values(5))[:12]
factor_stock=list(sheet.col_values(6))[:15]
starttime=time.clock()  # NOTE(review): time.clock() was removed in Python 3.8+; this runs on the platform's Python 2
# Notebook-wide configuration. `global` at module level is a no-op statement;
# these lines only document which names later cells treat as globals.
global g_index
global g_count
global g_factor_list
global g_univ_dict
global g_neu_factor
global g_factor_dict
g_index='000300.XSHG'  # benchmark index: CSI 300
g_count=500            # default look-back length in trading days
# Full factor universe, and the subset that gets industry/market-cap neutralization.
g_factor_list=factor_quality+factor_fundamental+factor_mood+factor_growth+factor_risk+factor_stock
g_neu_factor=factor_quality+factor_fundamental+factor_growth+factor_stock
g_factor_dict = {}
import cPickle as pickle
from six import StringIO
# Persist the factor list to research storage (write_file is a JoinQuant platform helper).
content = pickle.dumps(g_factor_list) # pickle.dumps returns a (byte) string
write_file('JQFactorAuto/g_factor_list.pkl', content, append=False)
import cPickle as pickle
from six import StringIO
# Persist the neutralization subset the same way.
content = pickle.dumps(g_neu_factor) # pickle.dumps returns a (byte) string
write_file('JQFactorAuto/g_neu_factor.pkl', content, append=False)
方法¶
获得因子数据方法¶
# Sample the trade-day calendar: every `interval`-th day, counted backwards from `end`.
def get_trade_dates(end,count=250,interval=20):
    """Return a chronologically ordered list of trade days, keeping every
    `interval`-th day counted backwards from `end` over the last `count` days
    (so `end`'s most recent trade day is always included)."""
    days = list(jqdata.get_trade_days(end_date=end, count=count))
    # Walk from the newest day backwards and keep positions 0, interval, 2*interval, ...
    sampled = [d for i, d in enumerate(reversed(days)) if i % interval == 0]
    sampled.reverse()
    return sampled
# Investable stock pool on a date: whole market or index members,
# excluding stocks listed for fewer than 60 trading days.
def get_stock_pool(date,index='all'):
    """Return the investable stock list on `date`; `index='all'` means the
    whole market, otherwise the intersection with the index constituents."""
    listed = get_all_securities(types=['stock'], date=date)
    cutoff = jqdata.get_trade_days(end_date=date, count=60)[0]
    # Drop recently listed stocks (IPO within the last 60 trading days).
    seasoned = listed[listed['start_date'] < cutoff]
    universe_pool = list(seasoned.index)
    if index == 'all':
        return universe_pool
    index_pool = get_index_stocks(index, date=date)
    return list(set(index_pool) & set(universe_pool))
def get_stock_universe(trade_date_list,index='all'):
    """Build the per-date stock pools.

    Returns (list of pools in input-date order, {date: pool} dict)."""
    pools = {}
    ordered = []
    for trade_date in trade_date_list:
        pool = get_stock_pool(trade_date, index)
        ordered.append(pool)
        pools[trade_date] = pool
    return ordered, pools
# Map each stock in `stock_list` to its Shenwan level-1 industry code on `date`.
def get_Industry_by_day(date,stock_list):
    """Return a Series (index = stocks, name = date) of SW-L1 industry codes;
    stocks not found in any of the listed industries stay NaN."""
    sw_l1_codes = ['801010', '801020', '801030', '801040', '801050', '801080', '801110', '801120', '801130',
                   '801140', '801150', '801160', '801170', '801180', '801200', '801210', '801230', '801710',
                   '801720', '801730', '801740', '801750', '801760', '801770', '801780', '801790', '801880', '801890']
    industry_se = Series(name=date, index=stock_list)
    for code in sw_l1_codes:
        members = set(get_industry_stocks(code, date=date)) & set(stock_list)
        industry_se.loc[list(members)] = code
    return industry_se
"""def get_Industry_by_day(date):
industry_set = ['801010', '801020', '801030', '801040', '801050', '801080', '801110', '801120', '801130',
'801140', '801150', '801160', '801170', '801180', '801200', '801210', '801230', '801710',
'801720', '801730', '801740', '801750', '801760', '801770', '801780', '801790', '801880','801890']
industry_df = pd.DataFrame(index=[date],columns=g_univ_dict[date])
for industry in industry_set:
industry_stocks = get_industry_stocks(industry,date = date)
industry_stocks = list(set(industry_stocks)&set(g_univ_dict[date]))
industry_df.loc[date,industry_stocks] = industry
return industry_df
# 得到对应日期的行业数据
def get_industry_df(trade_date_list):
all_industry_df = pd.DataFrame()
for date in trade_date_list:
data = get_Industry_by_day(date)
all_industry_df = pd.concat([all_industry_df, data])
return all_industry_df
all_industry_df = get_industry_df(trade_date_list)
# 行业日期类型
all_industry_df.index = list(map(lambda x:x.strftime("%Y-%m-%d"),all_industry_df.index))
"""
# Fetch all configured factor values for one day from the JoinQuant factor library.
def get_jq_factor_by_day(date,stock_list):
    """Return {factor_name: DataFrame} for every factor in g_factor_list on `date`."""
    values = get_factor_values(securities=stock_list, factors=g_factor_list,
                               start_date=date, end_date=date)
    return values
# Fill missing factor values with the median of the stock's industry.
def replace_nan_indu(factor_se,indu_se):
    """Fill NaN factor values with their industry median.

    Parameters
    ----------
    factor_se : Series of factor values per stock (may contain NaN).
    indu_se   : Series of industry codes per stock (NaN industries are dropped).

    Returns
    -------
    Series named 'values_x' (the name the original implementation returned),
    covering stocks present in both inputs; a NaN survives only when its
    whole industry is NaN.
    """
    factor_df = factor_se.to_frame('values')
    indu_df = indu_se.dropna().to_frame('industryName1')
    merged = factor_df.merge(indu_df, left_index=True, right_index=True, how='inner')
    # Per-industry median broadcast back onto every member stock.
    indu_median = merged.groupby('industryName1')['values'].transform('median')
    # fillna replaces the original chained-assignment write
    # (df['values_x'][mask] = ...), which silently writes to a copy under
    # modern pandas copy-on-write semantics.
    filled = merged['values'].fillna(indu_median)
    filled.name = 'values_x'
    return filled
# Build the fully processed factor DataFrame (stocks x factors) for one date.
def get_final_factors(date):
    """For `date`: fetch raw factor values for the configured index pool,
    fill NaN with industry medians, winsorize, optionally neutralize, and
    standardize. Returns a DataFrame indexed by stock, one column per factor."""
    # Investable pool for the configured index on this date
    stock_list = get_stock_pool(date,index=g_index)
    # Raw factor values for that pool
    factor_dict = get_jq_factor_by_day(date,stock_list)
    # SW-L1 industry membership, used for median filling (and implied by neutralization)
    indu_se = get_Industry_by_day(date,stock_list)
    #indu_se = all_industry_df.ix[date]
    factor_df = DataFrame()
    for fac in list(g_factor_list):
        # get_factor_values returns one row per day; take that single row as a Series
        factor_se = factor_dict[fac].iloc[0]
        # Replace missing values with the industry median
        factor_se = replace_nan_indu(factor_se,indu_se)
        # Winsorize at 3 median absolute deviations
        factor_se=winsorize_med(factor_se, scale=3, inclusive=True, inf2nan=True, axis=1)
        # Industry + market-cap neutralization, only for factors flagged in g_neu_factor
        if fac in g_neu_factor:
            factor_se=neutralize(factor_se, how=['jq_l1', 'market_cap'], date=date, axis=1)
        # Z-score standardization
        factor_se=standardlize(factor_se, inf2nan=True, axis=0)
        tmp_df = factor_se.to_frame(fac)
        factor_df = pd.concat([factor_df,tmp_df],axis=1)
    return factor_df
# Threaded variant of the per-date factor computation. (In the original file a
# caching re-definition with the same name follows and shadows this one.)
def get_all_final_factors(trade_date_list):
    """Compute the final factor DataFrame for every date in parallel.

    Returns the list of per-date DataFrames, in `trade_date_list` order.
    Fix: the original computed `pool.map(...)` but discarded the result and
    returned nothing, making the whole run a no-op for the caller.
    """
    pool = ThreadPool(processes=len(trade_date_list))
    frame_list = pool.map(get_final_factors, trade_date_list)
    pool.close()
    pool.join()
    return frame_list
def get_all_final_factors(trade_date_list):
    """Compute and cache the final factors date by date.

    Results are stored one file per year ('g_factor_dict_<year>.pkl', holding
    {date: DataFrame}); dates already in the cache are skipped, so an
    interrupted run is resumable.
    """
    for date in trade_date_list:
        year = date[:4]  # one cache file per calendar year
        # Load the existing cache for this year; start fresh if the file is
        # missing or unreadable. The narrowed except replaces a bare `except`
        # that would also have swallowed KeyboardInterrupt and real bugs.
        try:
            with open('g_factor_dict_%s.pkl'%year, 'rb') as pkl_file:
                g_factor_dict_tmp = pickle.load(pkl_file)
        except (IOError, OSError, EOFError, pickle.UnpicklingError):
            g_factor_dict_tmp = {}
        # Skip dates that were already computed.
        if date in g_factor_dict_tmp.keys():
            print("跳过:%s"%date)
            continue
        g_factor_dict_tmp[date] = get_final_factors(date)
        print("已完成:%s"%date)
        # `with` guarantees the handle is closed even if dump raises; the
        # original left both read and write handles open.
        with open('g_factor_dict_%s.pkl'%year, 'wb') as pkl_file:
            pickle.dump(g_factor_dict_tmp, pkl_file, 0)
当前日期¶
# Current run date
#today=datetime.date.today()
today = '2018-11-13'
today = '2015-07-31' # start date
# NOTE(review): only this last assignment takes effect; the two above are
# leftovers from earlier runs.
today = '2016-03-02'
g_count=500
# The trade day immediately before `today`
yesterday=jqdata.get_trade_days(end_date=today,count=2)[0]
print("today:%s"%today)
print("yesterday:%s"%yesterday)
第一步:获得因子数据¶
过去两年的交易日【每隔20交易日】¶
# Trade days over the past two years, sampled every 20 trading days
g_count=250*2
g_index='000300.XSHG'
trade_date_list=get_trade_dates(yesterday,g_count,20)
# Normalize to 'YYYY-MM-DD' strings: the cache keys and DataFrame indices use strings
trade_date_list = list(map(lambda x:x.strftime("%Y-%m-%d"),trade_date_list))
# Newest date first for the computation pass
trade_date_list.reverse()
trade_date_list
开始计算¶
# Run the (resumable) factor computation and time it.
starttime=time.clock()  # NOTE(review): time.clock() was removed in Python 3.8+
get_all_final_factors(trade_date_list)
endtime=time.clock()
runtime=endtime-starttime
print('因子生成运行完成,用时 %.2f 秒' % runtime)
第二步:因子检验¶
读取因子数据¶
# Anchor dates were set earlier in the notebook.
print("today:%s"%today)
print("yesterday:%s"%yesterday)
# Trade days over the past two years, sampled every 20 trading days
g_count=250*2
g_index='000300.XSHG'
trade_date_list=get_trade_dates(yesterday,g_count,20)
trade_date_list = list(map(lambda x:x.strftime("%Y-%m-%d"),trade_date_list))
# Years whose cache files must be loaded
years = list(set(map(lambda x:x[:4],trade_date_list)))
# Merge all yearly caches into one {date: DataFrame} dict
g_factor_dict = {}
for year in years:
    # `with` closes the handle; the original leaked one file object per year
    with open('g_factor_dict_%s.pkl'%year, 'rb') as pkl_file:
        g_factor_dict_tmp = pickle.load(pkl_file)
    g_factor_dict.update(g_factor_dict_tmp)
# Drop cached dates outside the requested window. Iterate over a snapshot of
# the keys: deleting while iterating the live view raises RuntimeError on
# Python 3 (Python 2's .keys() happened to return a list).
for k in list(g_factor_dict.keys()):
    if k not in trade_date_list:
        del g_factor_dict[k]
# Reshape {date: (stock x factor) DataFrame} into {factor: (stock x date) DataFrame}.
# NOTE(review): pd.Panel was removed in pandas 0.25; this requires the
# platform's older pandas.
p = Panel(g_factor_dict)
all_factor_dict = {}
for fac in p.minor_axis:
    all_factor_dict[fac] = p[:,:,fac].T
获取收益数据¶
# Forward returns between sampled trade days.
def get_return(trade_date_list,count=250):
    """Return (return_df, all_return_df).

    return_df:     forward return from each sampled trade day to the next
                   (rows = sampled dates).
    all_return_df: daily forward return for every trade day in the window.
    pct_change().shift(-1) makes row `t` hold the return earned AFTER `t`,
    so the last row is NaN.
    Known tail risk: a stock that was once an index member but has since
    delisted is missing from the price panel pulled on the final date.
    """
    date=max(trade_date_list)
    # Pre-adjusted close prices for the whole-market pool as of the last date
    universe=get_stock_pool(date,index='all')
    price=get_price(universe,end_date=date,count=count,fields=['close'],fq='pre')['close']
    price.index = list(map(lambda x:x.strftime("%Y-%m-%d"),price.index))
    return_df=price.loc[trade_date_list].pct_change().shift(-1)
    all_return_df=price.pct_change().shift(-1)
    return return_df,all_return_df
return_df,all_return_df=get_return(trade_date_list,count=g_count) # forward returns for all stocks
获取股票池¶
# NOTE(review): identical re-definition of get_stock_universe from the factor
# generation section, repeated so this section can run standalone.
def get_stock_universe(trade_date_list,index='all'):
    """Build the per-date stock pools; returns (list of pools, {date: pool})."""
    univ_list=[]
    univ_dict={}
    for date in trade_date_list:
        stock_pool=get_stock_pool(date,index)
        univ_list.append(stock_pool)
        univ_dict[date]=stock_pool
    return univ_list,univ_dict
print('获取股票池')
univ_list,univ_dict=get_stock_universe(trade_date_list,index=g_index) # build the pools
第二步:因子检验¶
读取数据¶
# Step II: helpers used for factor screening.
# Rank-IC series: Spearman correlation between factor values and forward returns.
def ic_calculator(factor,return_df,univ_dict):
    """Return the list of Spearman rank ICs, one per date in `univ_dict`,
    in chronological order.

    A stock enters a date's cross-section only if it has both a factor value
    and a forward return on that date. (The dead p-value accumulator the
    original built and never returned has been removed.)
    """
    ic_list=[]
    for date in sorted(list(univ_dict.keys())):
        univ=univ_dict[date]
        # Keep stocks with both a factor value and a forward return on this date
        univ=list(set(univ)&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        factor_se=factor.loc[date,univ]
        return_se=return_df.loc[date,univ]
        ic,p_value=st.spearmanr(factor_se,return_se)
        ic_list.append(ic)
    return ic_list
# 1. Daily cumulative group returns between rebalance dates (backtest NAV data).
def all_Group_Return_calculator(factor,univ_dict,all_return_df,GroupNum=10):
    """For each rebalance interval: sort the pool by factor value (descending),
    split into GroupNum equal buckets, and convert each bucket's equal-weight
    cumulative performance back into daily returns. Returns a DataFrame
    indexed by every trade day with one column per group.
    NOTE(review): DataFrame.sort() was removed in pandas 0.20+ (sort_values is
    the replacement) and the bare `np` relies on the platform environment;
    the only call site of this function is currently commented out.
    """
    all_date_list=list(all_return_df.index)   # every trade day
    date_list=sorted(list(univ_dict.keys()))  # rebalance days
    all_Group_Ret_df=pd.DataFrame(index=all_date_list,columns=list(np.array(range(GroupNum))))
    for n in range(len(date_list)-1):
        start=date_list[n]
        end=date_list[n+1]
        univ=univ_dict[start]  # pool is fixed at the interval start
        univ=set(univ)&set(factor.loc[start].dropna().index)
        # Stocks ordered by factor value, high to low
        factor_se_stock=list(factor.loc[start,univ].dropna().to_frame('a').sort('a',ascending=False).index)
        N=len(factor_se_stock)
        for i in range(GroupNum):
            group_stock=factor_se_stock[int(N/GroupNum*i):int(N/GroupNum*(i+1))]
            # Equal-weight cumulative growth of the bucket over the interval...
            cumret=(all_return_df.loc[start:end,group_stock]+1).cumprod().mean(axis=1)
            # ...converted back into daily returns aligned to the interval
            all_Group_Ret_df.loc[start:end,i]=cumret.shift(1).fillna(1).pct_change().shift(-1)
    all_Group_Ret_df=all_Group_Ret_df[date_list[0]:].shift(1).fillna(0)
    return all_Group_Ret_df
# Per-rebalance-date group returns: sort by factor, bucket, average forward return.
def Group_Return_calculator(factor,univ_dict,return_df,GroupNum=10):
    """Return a DataFrame (rebalance dates x GroupNum) of equal-weight group
    forward returns, shifted down one row so row `t` holds the return earned
    over the PREVIOUS interval (the first row becomes 0)."""
    date_list = sorted(list(univ_dict.keys()))
    GroupRet_df = pd.DataFrame(index=date_list, columns=list(np.array(range(GroupNum))))
    for date in date_list:
        univ = univ_dict[date]
        # Stocks must have both a factor value and a forward return on this date
        univ = list(set(univ) & set(factor.loc[date].dropna().index) & set(return_df.loc[date].dropna().index))
        # sort_values replaces DataFrame.sort, which was removed in pandas 0.20+
        factor_se_stock = list(factor.loc[date, univ].dropna().to_frame('a').sort_values('a', ascending=False).index)
        N = len(factor_se_stock)
        for i in range(GroupNum):
            # Float bucket edges split the ranking into GroupNum equal slices
            group_stock = factor_se_stock[int(N*1.0/GroupNum*i):int(N*1.0/GroupNum*(i+1))]
            GroupRet_df.loc[date, i] = return_df.loc[date, group_stock].mean()
    return GroupRet_df.shift(1).fillna(0)
# Benchmark index returns, both daily and at the rebalance frequency.
def get_index_return(univ_dict,index,count=250):
    """Return (daily index returns from the first rebalance date onward,
    index returns sampled at the rebalance dates). The first row of each is 0."""
    rebalance_days = sorted(list(univ_dict.keys()))
    last_day = max(rebalance_days)
    close = get_price(index, end_date=last_day, count=count, fields=['close'])['close']
    close.index = list(map(lambda x: x.strftime("%Y-%m-%d"), close.index))
    daily_ret = close.loc[rebalance_days[0]:].pct_change().fillna(0)
    sampled_ret = close.loc[rebalance_days].pct_change().fillna(0)
    return daily_ret, sampled_ret
# Factor effectiveness tests on the group-return matrix.
def effect_test(univ_dict,key,group_return,index_return,group_excess_return):
    """Run three screening tests for one factor.

    Returns (pass/fail list, score list) ordered:
    [monotonicity, winner-excess, loser-shortfall, winner-prob, loser-prob].
    NOTE(review): `cumprod` on the annual_return line is unqualified — it
    relies on the platform's implicit star import (pylab/numpy); confirm
    before running elsewhere.
    """
    start = sorted(list(univ_dict.keys()))[0]
    end = sorted(list(univ_dict.keys()))[-1]
    start = datetime.datetime.strptime(start, '%Y-%m-%d')
    end = datetime.datetime.strptime(end, '%Y-%m-%d')
    daylength=(end-start).days
    # Annualize via calendar-day exponent 365/daylength
    annual_return=np.power(cumprod(group_return+1).iloc[-1,:],365.0/daylength)
    index_annual_return=np.power((index_return+1).cumprod().iloc[-1],365.0/daylength)
    # Test 1: correlation between group number (0..9) and group annual return.
    # The bar actually used is 0.4, not the 0.5 the original comment claimed.
    sequence=pd.Series(np.array(range(10)))
    test_one_corr=annual_return.corr(sequence)
    test_one_passgrade=0.4
    test_one_pass=abs(test_one_corr)>test_one_passgrade
    # Winner/loser group index depends on the correlation's sign
    if test_one_corr<0:
        wingroup,losegroup=0,9
    else:
        wingroup,losegroup=9,0
    # Test 2: winner beats the index, loser trails it, each by more than 5%/yr
    test_two_passgrade=0.05
    test_two_win_excess=annual_return[wingroup]-index_annual_return
    test_two_win_pass=test_two_win_excess>test_two_passgrade
    test_two_lose_excess=index_annual_return-annual_return[losegroup]
    test_two_lose_pass=test_two_lose_excess>test_two_passgrade
    test_two_pass=test_two_win_pass&test_two_lose_pass
    # Test 3: frequency of winner beating / loser trailing the benchmark > 50%
    test_three_grade=0.5
    test_three_win_prob=(group_excess_return[wingroup]>0).sum()*1.0/len(group_excess_return[wingroup])
    test_three_win_pass=test_three_win_prob>0.5
    test_three_lose_prob=(group_excess_return[losegroup]<0).sum()*1.0/len(group_excess_return[losegroup])
    test_three_lose_pass=test_three_lose_prob>0.5
    test_three_pass=test_three_win_pass&test_three_lose_pass
    test_result=[test_one_pass,test_two_win_pass,test_two_lose_pass,test_three_win_pass,test_three_lose_pass]
    test_score=[test_one_corr,test_two_win_excess,test_two_lose_excess,test_three_win_prob,test_three_lose_prob]
    return test_result,test_score
计算每个因子的评分和筛选结果¶
# Compute the IC series for every factor.
starttime=time.clock()
print('\n计算IC_IR:')
count=1
ic_list_dict={}
for key,factor in all_factor_dict.items():
    ic_list=ic_calculator(factor,return_df,univ_dict)
    ic_list_dict[key]=ic_list
    print(count)  # progress indicator
    count=count+1
# One row per rebalance date, one column per factor
ic_df=pd.DataFrame(ic_list_dict,index=sorted(list(univ_dict.keys())))
# Drop the last row: it has no forward return (pct_change().shift(-1) leaves NaN)
ic_df = ic_df.iloc[:-1]
IC延续性¶
ic_df
# Share of factors whose IC shows a directional bias: a positive-IC ratio
# outside the [0.4, 0.6] band suggests the IC's sign persists over time.
count = 0
for col in ic_df.columns:
    tmpdf = ic_df[[col]]
    pct = 1.0*len(tmpdf[tmpdf[col]>0])/len(tmpdf)  # share of dates with IC > 0
    if pct < 0.4 or pct > 0.6:
        count += 1
# Fixes: under Python-2 integer division `count / len(...)` always printed 0,
# and the fraction was displayed next to a percent sign; scale by 100.
print("IC存在偏向和延续的概率:%.2f%%" % (100.0 * count / len(ic_df.columns)))
计算分组收益¶
# Group returns per factor at the rebalance frequency.
print('\n计算分组收益:')
count=1
GroupNum=10
# The daily NAV variant is only needed after screening, so it stays disabled here
all_Factor_Group_Return_dict={}
Factor_Group_Return_dict={}
for key,factor in all_factor_dict.items():
    #all_GroupRet_df=all_Group_Return_calculator(factor,univ_dict,all_return_df,GroupNum)
    #all_Factor_Group_Return_dict[key]=all_GroupRet_df.sort_index()
    # Rebalance-period group returns
    GroupRet_df=Group_Return_calculator(factor,univ_dict,return_df,GroupNum)
    Factor_Group_Return_dict[key]=GroupRet_df.sort_index()
    print(count)  # progress indicator
    count=count+1
计算指数收益¶
# Benchmark returns and per-factor excess group returns.
print('\n计算指数收益:')
count=1
index='000300.XSHG'
index_return,index_return_by_tradeday=get_index_return(univ_dict,index)
Factor_Group_Excess_Return_dict={}
for key,group_return in Factor_Group_Return_dict.items():
    # Subtract the benchmark return row-wise from every group column
    Factor_Group_Excess_Return_dict[key]=group_return.subtract(index_return_by_tradeday,axis=0)
    print(count)  # progress indicator
    count=count+1
因子有效性测试¶
# Run the three effectiveness tests for every factor.
print('\n因子有效性测试:')
count=1
effect_test_result_dict={}
effect_test_score_dict={}
for key,group_return in Factor_Group_Return_dict.items():
    group_excess_return=Factor_Group_Excess_Return_dict[key]
    effect_test_result_dict[key],effect_test_score_dict[key]=effect_test(univ_dict,key,group_return,index_return,group_excess_return)
    print(count)  # progress indicator
    count=count+1
第三步:因子筛选¶
# Summary tables: |mean IC|, ICIR, and the five test outcomes per factor.
ic_ir_se=ic_df.mean()/ic_df.std()
# Absolute value of the mean IC (direction is handled later via ICIR's sign)
ic_avg_se=ic_df.mean().abs()
EffectTestresult=pd.concat([ic_avg_se.to_frame('a'),ic_ir_se.to_frame('b'),pd.DataFrame(effect_test_result_dict).T],axis=1)
columns=['IC','ICIR','测试一', '测试二-胜者组', '测试二-败者组', '测试三-胜者组', '测试三-败者组']
EffectTestresult.columns=columns
# Same layout, but raw scores instead of booleans
EffectTestresult2=pd.concat([ic_avg_se.to_frame('a'),ic_ir_se.to_frame('b'),pd.DataFrame(effect_test_score_dict).T],axis=1)
columns=['IC','ICIR','测试一', '测试二-胜者组', '测试二-败者组', '测试三-胜者组', '测试三-败者组']
EffectTestresult2.columns=columns
EffectTestresult
EffectTestresult2
因子IC和ICIR筛选 quantile¶
# Screening thresholds are taken from the cross-sectional distribution,
# not hard-coded.
EffectTestresult['IC'].hist()
IC_ratio = EffectTestresult['IC'].quantile(0.80)  # keep roughly the top 20% by IC
IC_ratio
EffectTestresult['ICIR'].abs().hist()
ICIR_ratio = EffectTestresult['ICIR'].abs().quantile(0.75)  # top 25% by |ICIR|
ICIR_ratio
# Filter: IC and |ICIR| above their quantile bars; tests 1, 2-winner and
# 3-winner must pass; and at least 3 of the 4 test-2/test-3 outcomes pass.
index_ic=EffectTestresult['IC']>IC_ratio
index_icir=EffectTestresult['ICIR'].abs()>ICIR_ratio
# NOTE(review): bare all()/sum() with an axis= keyword rely on the platform's
# implicit star import of numpy (pylab); the builtins would raise TypeError.
test_index=all(EffectTestresult.iloc[:,[2,3,5]],axis=1)
test2_index=sum(EffectTestresult.iloc[:,3:7],axis=1)>=3
filter_index=index_ic&index_icir&test_index&test2_index
EffectFactorresult=EffectTestresult.loc[filter_index,:]
# Data panels of the surviving factors only
EffectFactor=list(EffectFactorresult.index)
Effect_factor_dict={key:value for key,value in all_factor_dict.items() if key in EffectFactor}
EffectFactorresult
IC排序,相关性筛选¶
# Surviving factors ordered by IC, best first.
# sort_values replaces DataFrame.sort, which was removed in pandas 0.20+.
effect_fac_list = EffectFactorresult.sort_values('IC',ascending=False).index.tolist()
# Assign each stock a bucket score derived from its factor rank on each date.
def Group_Score_calculator(factor,univ_dict,signal,GroupNum=20):
    """Return a DataFrame (dates x stocks) of group scores.

    Stocks are ranked by factor value descending and split into GroupNum
    buckets. signal='ascending' scores buckets 0..GroupNum-1 (top bucket 0);
    anything else scores them GroupNum..1 (top bucket GroupNum).
    """
    Score_df=pd.DataFrame(index=list(factor.index),columns=list(factor.columns))
    for date in sorted(list(univ_dict.keys())):
        univ=univ_dict[date]
        univ=list(set(univ)&set(factor.loc[date].dropna().index))
        # sort_values replaces DataFrame.sort (removed in pandas 0.20+)
        factor_se_stock=list(factor.loc[date,univ].to_frame('a').sort_values('a',ascending=False).index)
        N=len(factor_se_stock)
        for i in range(GroupNum):
            # Float bucket edges: the original `N/GroupNum` truncated to 0
            # under Python-2 integer division whenever N < GroupNum, leaving
            # every bucket empty; use the N*1.0 idiom the sibling
            # Group_Return_calculator already uses.
            group_stock=factor_se_stock[int(N*1.0/GroupNum*i):int(N*1.0/GroupNum*(i+1))]
            if signal=='ascending':
                Score_df.loc[date,group_stock]=i
            else:
                Score_df.loc[date,group_stock]=GroupNum-i
    return Score_df
# Average cross-sectional correlation matrix of factor group scores.
def factor_corr_calculator(Group_Score_dict,univ_dict):
    """Build the per-date (stock x factor) score panel, correlate the factors
    on each date, and return the across-date mean correlation matrix rounded
    to 2 decimals. Stocks without a score are filled with 4.5.
    NOTE(review): 4.5 looks like the neutral midpoint for 10 groups, while
    the scores are built with GroupNum=20 — confirm the intended fill value.
    """
    Group_Score_Corr_dict_by_day={}
    date_list=sorted(list(univ_dict.keys()))
    for Date in date_list:
        univ=univ_dict[Date]
        # One row per factor, columns = that day's stock pool.
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        rows=[Group_Score_dict[Factor].loc[Date,univ].to_frame(Factor).T
              for Factor in list(Group_Score_dict.keys())]
        day_scores=pd.concat(rows).T.fillna(4.5)
        Group_Score_Corr_dict_by_day[Date]=day_scores.corr()
    # Mean of the daily correlation matrices
    N=len(date_list)
    Group_Score_Corr=Group_Score_Corr_dict_by_day[date_list[0]]
    for Date in date_list[1:]:
        Group_Score_Corr=Group_Score_Corr+Group_Score_Corr_dict_by_day[Date]
    return np.round(Group_Score_Corr/N,2)
# Score every surviving factor; scoring direction follows the sign of its ICIR.
Group_Score_dict={}
for key,factor in Effect_factor_dict.items():
    signal='ascending' if ic_ir_se[key]>0 else 'descending'
    Group_Score_dict[key]=Group_Score_calculator(factor,univ_dict,signal,20)
# Average pairwise correlation of the factor score panels
factor_corrmatrix=factor_corr_calculator(Group_Score_dict,univ_dict)
factor_corrmatrix
fac_corr = factor_corrmatrix
# Greedy de-duplication, walking factors in descending IC order: drop any
# factor whose score correlation with an already-kept factor exceeds MinCorr.
# NOTE(review): the original comments said 0.95, but the threshold used is 0.9.
MinCorr = 0.9
result_fac_list = effect_fac_list[:1]  # always keep the best-IC factor
for fac in effect_fac_list:
    # Already kept — skip
    if fac in result_fac_list:
        continue
    fac_corr_se = fac_corr[fac]
    # Factors this one is highly correlated with
    fac_corr_list = fac_corr_se[fac_corr_se > MinCorr].index.tolist()
    # Keep the factor only if it is not highly correlated with anything kept so far
    if len(set(fac_corr_list) & set(result_fac_list)) <= 0:
        result_fac_list.append(fac)
result_fac_list = sorted(result_fac_list)
result_fac_list
# Mean-|IC| values of the final factor set (used later as combination weights)
result_fac_ic_se = EffectTestresult['IC'].loc[result_fac_list]
result_fac_ic_se
# Fetch raw values of the selected factors for the live trading date.
date = today
g_factor_list
stock_list = get_stock_pool(date,index=g_index)
len(stock_list)
factor_dict=get_factor_values(securities=stock_list, factors=result_fac_ic_se.index.tolist(), start_date=date, end_date=date)
需要对因子进行去极值、中性化、标准化处理¶
# Preprocess the live-date factors exactly like the backtest ones
# (industry-median fill, winsorize, neutralize, standardize).
indu_se = get_Industry_by_day(date,stock_list)
factor_df = DataFrame()
for fac in list(factor_dict.keys()):
    # Single-day query: take the one row as a Series
    factor_se = factor_dict[fac].iloc[0]
    # Fill missing values with the industry median
    factor_se = replace_nan_indu(factor_se,indu_se)
    # Winsorize at 3 median absolute deviations
    factor_se=winsorize_med(factor_se, scale=3, inclusive=True, inf2nan=True, axis=1)
    # Industry + market-cap neutralization where flagged
    if fac in g_neu_factor:
        factor_se=neutralize(factor_se, how=['jq_l1', 'market_cap'], date=date, axis=1)
    # Z-score standardization
    factor_se=standardlize(factor_se, inf2nan=True, axis=0)
    tmp_df = factor_se.to_frame(fac)
    factor_df = pd.concat([factor_df,tmp_df],axis=1)
fianl_factor_df = factor_df.T  # (sic) misspelled name kept for any unseen downstream cells
# IC weights from the LAST backtest row: using the current-period IC here
# would be the look-ahead bias discussed at the top of this notebook.
ic_se = ic_df[result_fac_list].iloc[-1]
stocks_to_buy = 30
# IC-weighted composite score, best first; sort_values replaces
# DataFrame.sort, which was removed in pandas 0.20+.
final_stock_list = fianl_factor_df.multiply(ic_se,axis=0).sum().to_frame('a').sort_values('a',ascending=False).index[0:stocks_to_buy]
final_stock_list