来聊一聊Q因子
关于聪明钱,去年3月份左右的时候研究过,不过就是简单的检测因子,有明显分层。于是在此基础上做了个策略。
再来看看因子逻辑:
'''
聪明钱的构建思路:
1、先计算指标s:s_t = |R_t| / sqrt(V_t),其中,R_t 为第 t 分钟的涨跌幅,V_t 为第 t 分钟的成交量。指标 s_t
的值越大,则表示该分钟的交易越“聪明”。
2、对所取时间段的每分钟按指标s从大到小排序,取成交量累积占比前20%的分钟视为聪明钱交易。报告中是取10天的数据,
那么就是2400分钟的数据(每个交易日240分钟)。
3、构造聪明钱的情绪因子Q:Q=VWAPsmart/VWAPall;其中,VWAPsmart是聪明钱的成交量加权平均价,
VWAPall是所有交易的成交量加权平均价。
'''
没有深入研究,尝试了机器学习,就出了一个策略。
结果样本外基本上就没啥动静,长期处于回撤期间。。。
本次就再返工研究这个因子是啥情况。。。打算放弃这个因子,如有错误,欢迎指正交流
import datetime

import numpy as np
import pandas as pd
from pandas import DataFrame,Series
一、计算q因子函数¶
在回测中调用¶
def calculate_q_factor(stock,frequency,count,threshold):
    '''
    Compute the "smart money" Q factor of one stock (backtest version).

    Every bar gets the score s = |close/open - 1| / sqrt(volume). Bars are
    ranked by s in descending order, and the top-ranked bars whose cumulative
    volume stays within ``threshold`` of the total volume are treated as
    smart-money trades. Q is the VWAP of those bars divided by the VWAP of
    all bars.

    params:
        stock: security code passed to attribute_history
        frequency: bar frequency passed to attribute_history
        count: number of bars to request
        threshold: share of the total volume attributed to smart money
    return:
        q_factor: the Q factor, or NaN when the window has no traded volume
    '''
    price = attribute_history(stock, count, frequency, ['open','close','volume','money'],df=True)
    volume = price['volume']
    bar_return = ((price['close'] - price['open']) / price['open']).abs()
    # Bars without volume get s = 0 so that they are ranked last.
    price['index_s'] = (bar_return / np.sqrt(volume)).where(volume != 0, 0)
    price = price.sort_values(by='index_s', ascending=False)

    vol_sum = price['volume'].sum()
    if vol_sum == 0:
        # No trading at all in the window: both VWAPs are undefined.
        return np.nan

    cum_volume = price['volume'].cumsum()
    smart = price[cum_volume <= threshold * vol_sum]
    if smart['volume'].sum() == 0:
        # The best-ranked traded bar alone already exceeds the volume budget.
        # Use that single bar (both its money and its volume) instead of
        # dividing zero money by its volume, which would give Q = 0.
        smart = price[price['volume'] > 0].iloc[:1]

    # VWAPsmart: volume-weighted average price of the smart-money bars.
    # VWAPall: volume-weighted average price of all bars.
    VWAPall = price['money'].sum() / vol_sum
    VWAPsmart = smart['money'].sum() / smart['volume'].sum()
    return VWAPsmart / VWAPall
在研究中调用¶
def calculate_q_factor(stock,frequency,count,threshold,end_date,num_try = 0):
    '''
    Compute the "smart money" Q factor of one stock (research version).

    Every bar gets the score s = |close/open - 1| / sqrt(volume). Bars are
    ranked by s in descending order, and the top-ranked bars whose cumulative
    volume stays within ``threshold`` of the total volume are treated as
    smart-money trades. Q is the VWAP of those bars divided by the VWAP of
    all bars.

    params:
        stock: security code passed to get_price
        frequency: bar frequency passed to get_price
        count: number of bars to request
        threshold: share of the total volume attributed to smart money
        end_date: last date of the window, passed to get_price
        num_try: number of failed attempts already made; the computation is
            attempted until this counter reaches 5
    return:
        q_factor: the Q factor, NaN when the window has no traded volume,
            or None when every attempt raised an exception
    '''
    max_try = 5
    while True:
        try:
            price = get_price(stock,fields = ['open','close','volume','money'],end_date=end_date,count = count, frequency=frequency)
            volume = price['volume']
            bar_return = ((price['close'] - price['open']) / price['open']).abs()
            # Bars without volume get s = 0 so that they are ranked last.
            price['index_s'] = (bar_return / np.sqrt(volume)).where(volume != 0, 0)
            price = price.sort_values(by='index_s', ascending=False)

            vol_sum = price['volume'].sum()
            if vol_sum == 0:
                # No trading at all in the window: both VWAPs are undefined.
                return np.nan

            cum_volume = price['volume'].cumsum()
            smart = price[cum_volume <= threshold * vol_sum]
            if smart['volume'].sum() == 0:
                # The best-ranked traded bar alone already exceeds the volume
                # budget. Use that single bar (money and volume) instead of
                # dividing zero money by its volume, which would give Q = 0.
                smart = price[price['volume'] > 0].iloc[:1]

            # VWAPsmart: volume-weighted average price of the smart-money bars.
            # VWAPall: volume-weighted average price of all bars.
            VWAPall = price['money'].sum() / vol_sum
            VWAPsmart = smart['money'].sum() / smart['volume'].sum()
            return VWAPsmart / VWAPall
        except Exception:
            # Best-effort by design: any failure of an attempt is retried.
            # The exception types raised by get_price are not visible here,
            # so the catch stays broad.
            num_try += 1
            if num_try >= max_try:
                return None
二、查看q值对参数的敏感性¶
测算不同时间频率、不同聪明钱比例的q值¶
# Bar length in minutes (as a string key) -> whole bars per 240-minute day,
# for the odd bar lengths 1, 3, ..., 21.
timeDict = {str(minutes): 240 // minutes for minutes in range(1, 22, 2)}
# Multiplier applied to the per-day bar count when sizing the data window.
N = 20
import matplotlib.pyplot as plt
import pandas as pd
横轴是不同的时间频率,纵轴是q值,每条曲线对应一个聪明钱比例参数¶
# Q factor of 000001.XSHE for every (bar length, smart-money share) pair.
# The end date is fixed once so that every call uses the same window end.
end_date = datetime.datetime.now().date()
dfs = {}
for step in range(2, 10):
    threshold = step / 20  # smart-money share: 0.10, 0.15, ..., 0.45
    dfs[threshold] = {
        key: calculate_q_factor('000001.XSHE', '{}m'.format(key), int(N * value), threshold, end_date)
        for key, value in timeDict.items()
    }
# Columns are the thresholds, rows are the bar lengths.
dfs = pd.DataFrame(dfs)
# Bar lengths are string keys; convert them so the rows sort numerically.
dfs.index = [int(x) for x in dfs.index.values]
dfs.sort_index(ascending=True, inplace=True)
# DataFrame.plot opens its own figure, so no separate plt.figure is needed.
dfs.plot(figsize=(20, 10))
结果上来看,不同频率的时间波动比较大,说明q因子计算对选定的时间很敏感,另外只有1m的时候,不论定义的聪明钱比例是多少,都是小于1的,其他时间频率是没有这个特征。所以这个因子用起来可能存在参数拟合的因素。¶
三、插件¶
文件保存¶
def save_file(filename,data,mode = True):
    '''
    Pickle an object to a file, or load a pickled object back.

    params:
        filename: path of the pickle file
        data: object to write (ignored when loading)
        mode: truthy writes ``data`` to ``filename``; falsy reads the file
    return:
        the object that was written, or the object that was loaded
    '''
    import pickle
    if not mode:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(filename, 'rb') as source:
            return pickle.load(source)
    with open(filename, 'wb') as target:
        pickle.dump(data, target)
    return data
获取交易时间¶
# 计算一段时间每个月的开始和最后一个交易日
def calculate_FL(time_list):
    '''
    Find the first and last day of every month covered by a list of dates.

    A month boundary is detected wherever two neighbouring entries differ in
    (year, month), so a calendar that skips a whole month is still split
    correctly. The first entry always opens a month and the last entry always
    closes one, even when the list starts or ends in the middle of a month.

    params:
        time_list: chronologically ordered dates; each entry must provide
            ``year``, ``month`` and ``strftime``
    return:
        (trade_start, trade_end): two lists of 'YYYY-MM-DD' strings; both are
        empty when ``time_list`` is empty
    '''
    days = list(time_list)
    if not days:
        return [], []

    labels = [day.strftime('%Y-%m-%d') for day in days]
    months = [(day.year, day.month) for day in days]

    trade_start = [labels[0]]
    trade_end = []
    for pos in range(1, len(days)):
        if months[pos] != months[pos - 1]:
            trade_end.append(labels[pos - 1])
            trade_start.append(labels[pos])
    trade_end.append(labels[-1])
    return trade_start, trade_end
剔除上市时间不足n天的新股¶
def delect_stop(stocks,beginDate,n=30*2):
    '''
    Keep the stocks whose start date lies more than n days before beginDate.

    params:
        stocks: iterable of security codes
        beginDate: reference date as a 'YYYY-MM-DD' string
        n: minimum age in calendar days
    return:
        list of the stocks whose ``get_security_info(stock).start_date`` is
        earlier than ``beginDate - n days``, in input order
    '''
    reference = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    cutoff = (reference - datetime.timedelta(days=n)).date()
    return [code for code in stocks if get_security_info(code).start_date < cutoff]
#获取股票池
def get_stock(stockPool,begin_date):
    '''
    Build the filtered stock universe of a named pool as of begin_date.

    params:
        stockPool: one of 'HS300', 'ZZ500', 'ZZ800', 'CYBZ', 'ZXBZ', 'A'
        begin_date: date as a 'YYYY-MM-DD' string, passed to get_index_stocks,
            get_extras and delect_stop
    return:
        list of security codes: the index constituents, minus the stocks
        flagged by the 'is_st' extra, minus the stocks rejected by delect_stop
    raises:
        ValueError: when stockPool is not one of the supported names
    '''
    index_codes = {
        'HS300': ('000300.XSHG',),
        'ZZ500': ('399905.XSHE',),
        'ZZ800': ('399906.XSHE',),
        'CYBZ': ('399006.XSHE',),
        'ZXBZ': ('399005.XSHE',),
        # The whole A-share pool is the union of two indexes.
        'A': ('000002.XSHG', '399107.XSHE'),
    }
    codes = index_codes.get(stockPool)
    if codes is None:
        raise ValueError('unknown stockPool {!r}; expected one of {}'.format(
            stockPool, sorted(index_codes)))

    stockList = []
    for code in codes:
        stockList += get_index_stocks(code, begin_date)

    # Drop ST stocks. get_extras is called with count=1, so each column holds
    # a single value; .iloc[0] reads it by position, which a plain [0] on a
    # date-indexed Series no longer does reliably in recent pandas.
    # NOTE(review): assumes get_extras returns a DataFrame keyed by stock.
    st_data = get_extras('is_st', stockList, count=1, end_date=begin_date)
    stockList = [stock for stock in stockList if not st_data[stock].iloc[0]]

    # Drop stocks that started too recently (see delect_stop).
    return delect_stop(stockList, begin_date)
四、计算每一期的q值¶
# 计算每一期的股票的池,key为时间,value为所有股票当期的因子值
# Q factors per rebalancing date: key is the date string, value is a dict
# mapping each stock of that date's universe to its Q factor.
q_dicts = {}
frequency = '1m'
count = 2400
threshold = 0.1
# Get the last trading day of every month.
from jqdata import *
year_list = ['2010','2011','2012','2013','2014','2015','2016','2017']
# One (start, end, file label) entry per output file. Each full year starts
# on Dec 31 of the previous year; 2018 is only covered up to Nov 30.
periods = [('{}-12-31'.format(int(year) - 1), '{}-12-31'.format(year), year) for year in year_list]
periods.append(('2017-12-31', '2018-11-30', 2018))
for period_start, period_end, label in periods:
    tradeTimeList = get_trade_days(start_date=period_start, end_date=period_end, count=None)
    FL = calculate_FL(tradeTimeList)
    monthFisrtDay = FL[0]
    monthLastDay = FL[1]
    print(monthLastDay)
    for tradeT in monthLastDay:
        if tradeT in q_dicts:
            # Consecutive periods share their boundary date; the whole
            # universe was already computed for it in the previous period.
            continue
        q_dict = {}
        stockPool = get_stock('A',tradeT)  # filtered stock universe
        for stk in stockPool:
            q_dict[stk] = calculate_q_factor(stk,frequency,count,threshold,tradeT)
        q_dicts[tradeT] = q_dict
    # Store the Q factors computed so far. q_dicts is never reset, so each
    # file also contains the columns of all earlier periods.
    df_qdict = pd.DataFrame(q_dicts)
    df_qdict.to_csv('all_q_dicts_{}.csv'.format(label))
五、分组回测¶
import pandas as pd
# 读取数据
# Load the yearly Q-factor tables; the first CSV column becomes the index.
q_factor_files = [
    'all_q_dicts_2010.csv',
    'all_q_dicts_2011.csv',
    'all_q_dicts_2012.csv',
    'all_q_dicts_2013.csv',
    'all_q_dicts_2014.csv',
    'all_q_dicts_2015.csv',
    'all_q_dicts_2016.csv',
    'all_q_dicts_2017.csv',
    'all_q_dicts_2018.csv',
]
(df_dict_2010, df_dict_2011, df_dict_2012,
 df_dict_2013, df_dict_2014, df_dict_2015,
 df_dict_2016, df_dict_2017, df_dict_2018) = [
    pd.read_csv(csv_name, index_col=0) for csv_name in q_factor_files
]
df_dict_2018.head()
# The column labels of the 2018 table are the rebalancing dates.
dateList = df_dict_2018.columns.values
dateList
计算下个月的起止时间¶
import calendar
def calculate_next_month(day):
    '''
    Return the first and last calendar day of the month following ``day``.

    params:
        day: object with ``year`` and ``month`` attributes
    return:
        (next_month_start, next_month_end) as datetime.datetime at midnight
    '''
    # December rolls over: divmod(12, 12) == (1, 0) -> January of next year.
    year_carry, month_index = divmod(day.month, 12)
    target_year = day.year + year_carry
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    first = datetime.datetime(target_year, target_month, 1)
    last = datetime.datetime(target_year, target_month, last_day)
    return first, last
计算股票池平均收益¶
def calculate_class_rts(stocklist,start_date,end_date):
    '''
    Average return of a group of stocks over one period.

    The return of each stock is last close / first close - 1 of the daily
    bars get_price returns for [start_date, end_date]. Stocks without bars,
    or with a NaN return, are left out of the average.

    params:
        stocklist: iterable of security codes
        start_date: period start, passed to get_price
        end_date: period end, passed to get_price
    return:
        mean of the valid returns, or NaN when there is none (for example
        when ``stocklist`` is empty)
    '''
    rts_list = []
    for stk in stocklist:
        price = get_price(stk, start_date=start_date, end_date=end_date, frequency='daily', fields=['close'])
        close = price['close']
        if len(close) == 0:
            # No bars in the period: there is no return to compute.
            continue
        rts_list.append(close.iloc[-1] / close.iloc[0] - 1)
    # Series.mean skips NaN and yields NaN (not an error) for an empty series.
    return pd.Series(rts_list, dtype='float64').mean()
# Next-month average return of each Q-factor quintile, keyed by the
# rebalancing date. classA holds the lowest Q values, classE the highest.
classA = {}
classB = {}
classC = {}
classD = {}
classE = {}
quintiles = (classA, classB, classC, classD, classE)
# The last date is skipped: the loop needs the month after each date.
for date in dateList[:-1]:
    date_datetime = datetime.datetime.strptime(date,'%Y-%m-%d').date()
    # Start and end of the following month.
    next_month_start,next_month_end = calculate_next_month(date_datetime)
    # Rank the stocks by factor value. dropna/sort_values return new objects,
    # so the column inside df_dict_2018 is never modified.
    q_factors = df_dict_2018[date].dropna().sort_values(ascending=True)
    stockListLast = list(q_factors.index.values)
    lens = len(stockListLast)
    # Integer cut points 0, n/5, 2n/5, ..., n avoid float rounding in the
    # group boundaries.
    bounds = [part * lens // 5 for part in range(6)]
    for bucket, lower, upper in zip(quintiles, bounds[:-1], bounds[1:]):
        bucket[date] = calculate_class_rts(stockListLast[lower:upper],next_month_start,next_month_end)
df = pd.DataFrame({'A':classA,
                   'B':classB,
                   'C':classC,
                   'D':classD,
                   'E':classE})
df.head()
# Cumulative net value of each group.
(df+1).cumprod().plot(figsize = (20,8))
单调性并不明显。。。主要是最小组的表现比较意外。¶