繁簡切換您正在訪問的是FX168財經網,本網站所提供的內容及信息均遵守中華人民共和國香港特別行政區當地法律法規。

FX168财经网>人物频道>帖子

用 IC 评价因子效果靠谱吗?-(利用分组或加权来提高IC准确度)

作者/sdjfshd 2019-09-07 20:00 0 来源: FX168财经网人物频道
# 第二步-因子检验
import datetime
import pickle
import time
from multiprocessing.dummy import Pool as ThreadPool

import jqdata
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
from jqfactor import Factor,calc_factors
# Load the objects written by the factor-generation step (step one).
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a MyPackage.pkl that you produced yourself.
with open('MyPackage.pkl', 'rb') as pkl_file:   # closed even if unpickling fails
    load_Package = pickle.load(pkl_file)
g_univ_dict,return_df,all_return_df,raw_factor_dict,all_factor_dict,all_industry_df=load_Package

univ_dict=g_univ_dict
# Step II: 因子筛选用到的函数
def ic_calculator(factor,return_df,univ_dict):
    """Return the per-date Spearman rank IC between a factor and returns.

    For every date key of ``univ_dict`` (in sorted order) the universe is
    narrowed to the stocks that have a non-NaN value in both ``factor`` and
    ``return_df`` on that date.  Dates with fewer than 10 such stocks are
    skipped, so the result can be shorter than ``univ_dict``.

    Args:
        factor: DataFrame indexed by date with one column per stock.
        return_df: DataFrame of returns with the same layout.
        univ_dict: mapping of date -> iterable of stock codes.

    Returns:
        List of Spearman correlation coefficients, one per date kept.
    """
    rank_ics=[]
    for date in sorted(univ_dict):
        factor_row=factor.loc[date].dropna()
        return_row=return_df.loc[date].dropna()
        usable=list(set(univ_dict[date])&set(factor_row.index)&set(return_row.index))
        if len(usable)>=10:
            # Only the coefficient is kept; the p-value is not used.
            rank_ic,_=st.spearmanr(factor_row.loc[usable],return_row.loc[usable])
            rank_ics.append(rank_ic)
    return rank_ics

def weighted_ic_calculator(factor,return_df,univ_dict,w=0.95):
    """Return the per-date exponentially weighted IC of a factor.

    For every date key of ``univ_dict`` (in sorted order) the universe is
    narrowed to stocks with non-NaN factor and return values; dates with
    fewer than 10 such stocks are skipped.  Stocks are then ordered by factor
    value (descending when the plain Spearman IC is positive, ascending
    otherwise), the k-th stock in that order gets weight ``w**k`` (normalised
    to sum to 1), and the weighted Pearson correlation between factor and
    return is computed.

    Args:
        factor: DataFrame indexed by date with one column per stock.
        return_df: DataFrame of returns with the same layout.
        univ_dict: mapping of date -> iterable of stock codes.
        w: decay base of the rank weights.

    Returns:
        List of weighted correlation coefficients, one per date kept.  An
        entry is NaN/inf when a weighted variance is zero.
    """
    ic_list=[]
    for date in sorted(univ_dict):
        univ=list(set(univ_dict[date])&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        if len(univ)<10:
            continue
        df=pd.DataFrame({'factor':factor.loc[date,univ],'ret':return_df.loc[date,univ]}).astype(float)
        ic,_=st.spearmanr(df['factor'],df['ret'])
        # The sign of the rank IC decides which end of the factor ranking
        # receives the largest weights.
        df=df.sort_values('factor',ascending=not ic>0)   # DataFrame.sort() no longer exists

        weight=w**np.arange(len(df))
        weight=weight/weight.sum()
        factor_values=np.asarray(df['factor'],dtype=float)
        ret_values=np.asarray(df['ret'],dtype=float)

        # Weighted moments: cov = E[xy]-E[x]E[y], var = E[x^2]-E[x]^2.
        mean_factor=np.sum(weight*factor_values)
        mean_ret=np.sum(weight*ret_values)
        covariance=np.sum(weight*factor_values*ret_values)-mean_factor*mean_ret
        var_factor=np.sum(weight*factor_values**2)-mean_factor**2
        var_ret=np.sum(weight*ret_values**2)-mean_ret**2
        ic_list.append(covariance/(np.sqrt(var_factor)*np.sqrt(var_ret)))

    return ic_list

def grouped_ic_calculator(factor,return_df,univ_dict,Group=20):
    """Return the per-date IC computed on factor-sorted group averages.

    For every date key of ``univ_dict`` (in sorted order) the universe is
    narrowed to stocks with non-NaN factor and return values; dates with
    fewer than 10 such stocks are skipped.  The stocks are sorted by factor
    value, cut into ``Group`` consecutive buckets, and the Pearson
    correlation between the bucket means of the factor and of the return is
    recorded.

    Buckets are half-open positional slices ``[round(i*N/Group),
    round((i+1)*N/Group))`` so every stock lands in exactly one bucket.
    Buckets that come out empty (possible when N < Group) are skipped instead
    of contributing a NaN mean.

    Args:
        factor: DataFrame indexed by date with one column per stock.
        return_df: DataFrame of returns with the same layout.
        univ_dict: mapping of date -> iterable of stock codes.
        Group: integer number of buckets.

    Returns:
        List of grouped correlation coefficients, one per date kept.
    """
    ic_list=[]
    for date in sorted(univ_dict):
        univ=list(set(univ_dict[date])&set(factor.loc[date].dropna().index)&set(return_df.loc[date].dropna().index))
        if len(univ)<10:
            continue
        df=pd.DataFrame({'factor':factor.loc[date,univ],'ret':return_df.loc[date,univ]}).astype(float)
        df=df.sort_values('factor',ascending=True)   # DataFrame.sort() no longer exists
        N=len(df)

        # Shared boundaries guarantee adjacent buckets neither overlap nor leave gaps.
        bounds=[int(round(i*N/Group)) for i in range(Group+1)]
        factor_grouped_list=[]
        ret_grouped_list=[]
        for start,stop in zip(bounds[:-1],bounds[1:]):
            if stop<=start:
                continue   # empty bucket
            bucket=df.iloc[start:stop]   # positional slice; .ix no longer exists
            factor_grouped_list.append(bucket['factor'].mean())
            ret_grouped_list.append(bucket['ret'].mean())

        VCV=np.cov(np.array(ret_grouped_list),np.array(factor_grouped_list))
        grouped_ic=VCV[0,1]/np.sqrt(VCV[0,0]*VCV[1,1])
        ic_list.append(grouped_ic)
    return ic_list

# Step II driver: compute plain, weighted and grouped IC for every factor and
# plot a histogram of the absolute mean IC of each flavour.
# time.clock() was removed in Python 3.8; perf_counter() is its replacement
# for measuring elapsed time.
starttime=time.perf_counter()

# NOTE: every DataFrame below is indexed by all universe dates except the
# last one, i.e. it assumes each calculator skipped exactly the final date.
# If a calculator skips any other date the list lengths will not match the
# index and pandas will raise.

print('\n计算IC:')
count=1
ic_list_dict={}
for key,factor in all_factor_dict.items():
    ic_list=ic_calculator(factor,return_df,univ_dict)
    ic_list_dict[key]=ic_list
    print(count,end=',')   # progress counter
    count=count+1

ic_df=pd.DataFrame(ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
ic_df.mean().abs().hist()

print('\n计算Weighted_IC:')
count=1
weighted_ic_list_dict={}
for key,factor in all_factor_dict.items():
    weighted_ic_list=weighted_ic_calculator(factor,return_df,univ_dict)
    weighted_ic_list_dict[key]=weighted_ic_list
    print(count,end=',')
    count=count+1

weighted_ic_df=pd.DataFrame(weighted_ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
weighted_ic_df.mean().abs().hist()


print('\n计算Grouped_IC:')
count=1
grouped_ic_list_dict={}
for key,factor in all_factor_dict.items():
    grouped_ic_list=grouped_ic_calculator(factor,return_df,univ_dict)
    grouped_ic_list_dict[key]=grouped_ic_list
    print(count,end=',')
    count=count+1

grouped_ic_df=pd.DataFrame(grouped_ic_list_dict,index=sorted(list(univ_dict.keys()))[:-1])
grouped_ic_df.mean().abs().hist()

endtime=time.perf_counter()
runtime=endtime-starttime
print('因子生成运行完成,用时 %.2f 秒' % runtime)
计算IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
计算Weighted_IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
计算Grouped_IC:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,因子生成运行完成,用时 122.53 秒
ic_df.mean().abs().hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc2b9926a58>
weighted_ic_df.mean().abs().hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc2bbf35f28>
grouped_ic_df.mean().abs().hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7fc2a7d86978>
# 如果你没有因子数据,那么就先运行第一步-因子生成,大约需要18分钟。
import time
import jqdata
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from jqfactor import Factor,calc_factors
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
from jqfactor import get_factor_values
from jqfactor import winsorize,winsorize_med,neutralize,standardlize
import pickle

# Typing 156 factor names by hand is tedious, so they are read from an
# uploaded Excel sheet; they could also be listed manually.
# NOTE(review): xlrd 2.0+ only reads .xls files; opening this .xlsx needs an
# older xlrd or a different reader.
import xlrd
ExcelFile=xlrd.open_workbook('FactorTable.xlsx')
name=ExcelFile.sheet_names()
sheet=ExcelFile.sheet_by_name(name[0])
# One factor category per sheet column; the slices cut each column to the
# number of rows kept for that category (column 1 is taken whole).
factor_quality=list(sheet.col_values(1))
factor_fundamental=list(sheet.col_values(2))[:28]
factor_mood=list(sheet.col_values(3))[:35]
factor_growth=list(sheet.col_values(4))[:8]
factor_risk=list(sheet.col_values(5))[:12]
factor_stock=list(sheet.col_values(6))[:15]

# time.clock() was removed in Python 3.8.  Keep using it where it still
# exists so the value stays comparable with later time.clock() readings, and
# fall back to time.perf_counter() elsewhere.
starttime=getattr(time,'clock',time.perf_counter)()

# Module-level configuration shared by the functions below.  (`global`
# statements are a no-op at module scope, so none are needed here.)
g_index='000300.XSHG'     # index whose constituents form the universe
g_count=500               # number of trading days of history
g_factor_list=factor_quality+factor_fundamental+factor_mood+factor_growth+factor_risk+factor_stock
# Factor categories that get neutralized during pre-treatment.
g_neu_factor=factor_quality+factor_fundamental+factor_growth+factor_stock

def get_trade_dates(end,count=250,interval=20):
    """Return every ``interval``-th trading day, counted back from ``end``.

    Fetches the last ``count`` trading days up to ``end`` from
    ``jqdata.get_trade_days`` and keeps the most recent day plus every
    ``interval``-th day before it.

    Args:
        end: last date of the window, passed to ``get_trade_days`` as ``end_date``.
        count: number of trading days to fetch.
        interval: spacing, in trading days, between the dates kept.

    Returns:
        List of the selected dates in the same (chronological) order in which
        ``get_trade_days`` returned them.
    """
    date_list=list(jqdata.get_trade_days(end_date=end,count=count))
    # Reverse so the stride starts at the most recent day, sample, then
    # restore the original order.  Equivalent to filtering on
    # ``index % interval == 0`` (trading days are assumed unique) without the
    # quadratic list.index() scans.
    return date_list[::-1][::interval][::-1]

def get_stock_pool(date,index='all'):
    """Return the stock codes eligible on ``date``.

    Stocks whose ``start_date`` is not earlier than the first of the 60
    trading days ending at ``date`` are dropped (too recently listed).  When
    ``index`` is not ``'all'`` the result is further restricted to the
    constituents of that index on ``date``.

    Args:
        date: evaluation date.
        index: ``'all'`` or an index code understood by ``get_index_stocks``.

    Returns:
        List of stock codes.
    """
    securities=get_all_securities(types=['stock'],date=date)
    # First day of the 60-trading-day window ending at `date`.
    listing_cutoff=jqdata.get_trade_days(end_date=date,count=60)[0]
    seasoned=securities[securities['start_date']<listing_cutoff]
    universe_pool=list(seasoned.index)
    if index=='all':
        return universe_pool
    constituents=get_index_stocks(index,date=date)
    return list(set(constituents)&set(universe_pool))

def get_stock_universe(trade_date_list,index='all'):
    """Build the stock pool for every date in ``trade_date_list``.

    Args:
        trade_date_list: dates to evaluate.
        index: forwarded to ``get_stock_pool``.

    Returns:
        Tuple ``(univ_list, univ_dict)``: the pools as a list in input order,
        and the same pools keyed by date.
    """
    pools=[get_stock_pool(day,index) for day in trade_date_list]
    return pools,dict(zip(trade_date_list,pools))

def get_return(trade_date_list,count=250):
    """Return forward returns sampled on the trade dates and on every day.

    Prices are pulled for the stock pool of the latest date in
    ``trade_date_list``.  Small risk: a stock that used to be an index
    constituent may have been delisted since and is then missing here.

    Args:
        trade_date_list: sampling dates.
        count: number of days of close prices to fetch up to the latest date.

    Returns:
        Tuple ``(return_df, all_return_df)``: percentage change of the close
        price shifted back one row, computed between consecutive sampling
        dates and between consecutive rows of the full price table.
    """
    last_date=max(trade_date_list)
    universe=get_stock_pool(last_date,index='all')
    close=get_price(universe,end_date=last_date,count=count,fields=['close'],fq='pre')['close']
    # shift(-1) aligns each row with the return realised over the NEXT period.
    sampled_returns=close.loc[trade_date_list].pct_change().shift(-1)
    daily_returns=close.pct_change().shift(-1)
    return sampled_returns,daily_returns

def get_jq_factor_by_day(date):
    """Fetch all factors in ``g_factor_list`` for that day's universe.

    Returns whatever ``get_factor_values`` returns for the single-day window
    ``[date, date]`` and the stocks in ``g_univ_dict[date]``.
    """
    return get_factor_values(
        securities=g_univ_dict[date],
        factors=g_factor_list,
        start_date=date,
        end_date=date,
    )

def get_raw_factor_dict1(trade_date_list):
    """Sequentially collect raw factor values for every trade date.

    Single-threaded counterpart of ``get_raw_factor_dict``.

    Args:
        trade_date_list: dates to fetch, in the order the rows should appear.

    Returns:
        Dict mapping each name in ``g_factor_list`` to the row-wise
        concatenation of that factor's daily frames (an empty DataFrame when
        ``trade_date_list`` is empty).
    """
    # Collect the daily frames first and concatenate once per factor;
    # concatenating inside the date loop re-copies all earlier rows each time.
    frames_by_factor={factor:[] for factor in g_factor_list}
    for date in trade_date_list:
        all_factor_by_day=get_jq_factor_by_day(date)
        for factor in g_factor_list:
            frames_by_factor[factor].append(all_factor_by_day[factor])

    return {
        factor:(pd.concat(frames) if frames else pd.DataFrame())
        for factor,frames in frames_by_factor.items()
    }

def get_raw_factor_dict(trade_date_list):
    """Fetch raw factor values for all trade dates using a thread pool.

    One worker thread is created per date; each calls
    ``get_jq_factor_by_day``.  A running count is printed after each factor
    has been assembled.

    Args:
        trade_date_list: dates to fetch.

    Returns:
        Dict mapping each name in ``g_factor_list`` to the row-wise
        concatenation of that factor's daily frames.
    """
    workers=ThreadPool(processes=len(trade_date_list))
    daily_results=workers.map(get_jq_factor_by_day,trade_date_list)
    workers.close()
    workers.join()

    raw_factor_dict={}
    for done,factor in enumerate(g_factor_list,1):
        raw_factor_dict[factor]=pd.concat([daily[factor] for daily in daily_results],axis=0)
        print(done,end=',')   # progress counter
    return raw_factor_dict

def get_Industry_by_day(date):
    """Return a one-row DataFrame labelling each universe stock with an industry code.

    The row is indexed by ``date`` and has one column per stock in
    ``g_univ_dict[date]``.  Stocks not found in any listed industry stay NaN;
    a stock found in several keeps the code that appears last in the list.
    """
    industry_set=[
        '801010','801020','801030','801040','801050','801080','801110',
        '801120','801130','801140','801150','801160','801170','801180',
        '801200','801210','801230','801710','801720','801730','801740',
        '801750','801760','801770','801780','801790','801880','801890',
    ]
    universe=g_univ_dict[date]
    in_universe=set(universe)
    industry_df=pd.DataFrame(index=[date],columns=universe)
    for code in industry_set:
        members=list(set(get_industry_stocks(code,date=date))&in_universe)
        industry_df.loc[date,members]=code
    return industry_df

def get_industry_df(trade_date_list):
    """Stack the daily industry rows of every trade date into one DataFrame.

    Prints a running count after each date is processed.
    """
    stacked=pd.DataFrame()
    for done,day in enumerate(trade_date_list,1):
        stacked=pd.concat([stacked,get_Industry_by_day(day)],axis=0)
        print(done,end=',')   # progress counter
    return stacked

def replace_nan_indu(all_industry_df,factor_df,univ_dict):
    """Fill missing factor values with the same-day industry median.

    For each date in ``univ_dict`` the stocks of that day's universe that
    have an industry label are kept (stocks without one are dropped), and
    every NaN factor value is replaced by the median of the stock's industry
    on that date.  A value stays NaN when the whole industry has no data.

    Args:
        all_industry_df: DataFrame (date x stock) of industry codes.
        factor_df: DataFrame (date x stock) of raw factor values.
        univ_dict: mapping of date -> list of stock codes.

    Returns:
        DataFrame with one row per date (in ``univ_dict`` iteration order)
        and one column per stock that had an industry on at least one date;
        empty when ``univ_dict`` is empty.
    """
    filled_rows=[]
    for date in univ_dict:
        univ=univ_dict[date]
        industry=all_industry_df.loc[date,univ].dropna()
        # Keep only stocks that have an industry label.
        values=factor_df.loc[date,univ].loc[industry.index]
        industry_median=values.groupby(industry).transform('median')
        filled=values.fillna(industry_median)
        filled_rows.append(filled.to_frame(date).T)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(filled_rows) if filled_rows else pd.DataFrame()

def pretreat_factor(factor_df,g_univ_dict,neu):
    """Winsorize, optionally neutralize, and standardize a factor day by day.

    Args:
        factor_df: DataFrame (date x stock) of factor values.
        g_univ_dict: mapping of date -> list of stock codes.
        neu: when truthy, ``neutralize`` is applied with
            ``how=['jq_l1', 'market_cap']`` between the other two steps.

    Returns:
        DataFrame with the same index and columns as ``factor_df`` holding
        the treated values; cells that were never written stay NaN.
    """
    treated=pd.DataFrame(index=list(factor_df.index),columns=list(factor_df.columns))
    for day in sorted(g_univ_dict):
        day_values=factor_df.loc[day,g_univ_dict[day]].dropna()
        # Step 1: winsorize.
        day_values=winsorize_med(day_values,scale=3,inclusive=True,inf2nan=True,axis=1)
        # Step 2 (optional): neutralize.
        if neu:
            day_values=neutralize(day_values,how=['jq_l1','market_cap'],date=day,axis=1)
        # Step 3: standardize.
        day_values=standardlize(day_values,inf2nan=True,axis=0)
        treated.loc[day,list(day_values.index)]=day_values
    return treated

def get_all_factor_dict(raw_factor_dict,g_univ_dict,all_industry_df):
    """Run NaN filling and pre-treatment on every raw factor.

    NaNs are first replaced by the industry median; some can remain, e.g.
    when a whole industry lacks the data or its only stock is NaN.  Factors
    whose name is in ``g_neu_factor`` are also neutralized.  A running count
    is printed after each factor.

    Returns:
        Dict mapping factor name -> treated DataFrame.
    """
    treated_by_name={}
    for done,(name,raw_df) in enumerate(raw_factor_dict.items(),1):
        filled=replace_nan_indu(all_industry_df,raw_df,g_univ_dict)
        treated_by_name[name]=pretreat_factor(filled,g_univ_dict,name in g_neu_factor)
        print(done,end=',')   # progress counter
    return treated_by_name

# Step I driver: build the universe, returns, raw and treated factors, then
# pickle everything for the factor-testing step.
print('开始运行...')
# time.clock() was removed in Python 3.8.  The timer is (re)started here with
# perf_counter() so the start and end readings come from the same clock.
starttime=time.perf_counter()
today=datetime.date.today()
# Last date of the study window: the first of the 10 trading days ending today.
yesterday=jqdata.get_trade_days(end_date=today,count=10)[0]
print('获取时间序列')
trade_date_list=get_trade_dates(yesterday,g_count,20)                        # sampling dates
print('获取股票池')
univ_list,g_univ_dict=get_stock_universe(trade_date_list,index=g_index)      # stock pool per date
print('获取历史回报')
return_df,all_return_df=get_return(trade_date_list,count=g_count)           # historical returns (all stocks)
print('获取因子,共计%d个,进度:' % len(g_factor_list))
raw_factor_dict=get_raw_factor_dict(trade_date_list)
print('\n获取行业数据')
all_industry_df=get_industry_df(trade_date_list)
print('\n处理数据---去极值化/中性化/标准化,共计%d个,进度:'% len(g_factor_list))
all_factor_dict=get_all_factor_dict(raw_factor_dict,g_univ_dict,all_industry_df)
print('\npickle序列化')
Package=[g_univ_dict,return_df,all_return_df,raw_factor_dict,all_factor_dict,all_industry_df]
with open('MyPackage.pkl', 'wb') as pkl_file:   # closed even if dumping fails
    pickle.dump(Package,pkl_file,0)
endtime=time.perf_counter()
runtime=endtime-starttime
print('因子生成运行完成,用时 %.2f 秒' % runtime)
开始运行...
获取时间序列
获取股票池
获取历史回报
获取因子,共计156个,进度:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
获取行业数据
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,处理数据---去极值化/中性化/标准化,共计156个,进度:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,
pickle序列化
因子生成运行完成,用时 1078.49 秒
 
分享到:
举报财经168客户端下载

全部回复

0/140

投稿 您想发表你的观点和看法?

更多人气分析师

  • 金算盘

    人气2696文章7761粉丝124

    高级分析师,混过名校,厮杀于股市和期货、证券市场多年,专注...

  • 李冉晴

    人气2296文章3821粉丝34

    李冉晴,专业现贷实盘分析师。

  • 张迎妤

    人气1896文章3305粉丝34

    个人专注于行情技术分析,消息面解读剖析,给予您第一时间方向...

  • 指导老师

    人气1856文章4423粉丝52

    暂无个人简介信息

  • 梁孟梵

    人气2152文章3177粉丝39

    qq:2294906466 了解群指导添加微信mfmacd

  • 刘钥钥1

    人气2016文章3119粉丝34

    专业从事现货黄金、现货白银模似实盘操作分析指导

  • 张亦巧

    人气2144文章4145粉丝45

    暂无个人简介信息

  • 金帝财神

    人气4720文章8329粉丝118

    本文由资深分析师金帝财神微信:934295330,指导黄金,白银,...

  • 金泰铬J

    人气2320文章3925粉丝51

    投资问答解咨询金泰铬V/信tgtg67即可获取每日的实时资讯、行情...