5.3 自动特征生成与选择

下面将使用 tsfresh 包演示如何进行自动特征生成和特征选择。

from tsfresh.examples.robot_execution_failures import download_robot_execution_failures,load_robot_execution_failures
from tsfresh import extract_features,select_features
import pandas as pd

下一步需要注意:由于国内网络的限制,直接运行下面的下载代码可能会连接失败,此时有两个办法:1)在该地址 https://github.com/MaxBenChrist/robot-failure-dataset 手动下载 lp1.data;2)在网站 https://www.ipaddress.com 中输入 raw.githubusercontent.com 查询其真实 IP,然后在 C:\Windows\System32\drivers\etc 下的 hosts 文件中添加类似这样的几行,例如:185.199.108.133 raw.githubusercontent.com

# Fetch the robot execution failures example data set and load it into memory.
download_robot_execution_failures() # download the data (needs network access)
timeseries, y = load_robot_execution_failures() # load the data; y is used later as the target for feature selection
# The table has 8 columns: `id` identifies which time series a row belongs to
# (it is passed as column_id to extract_features), `time` is the time axis, and
# the remaining 6 columns hold the values of the different series dimensions.
timeseries.columns
Index(['id', 'time', 'F_x', 'F_y', 'F_z', 'T_x', 'T_y', 'T_z'], dtype='object')
# Automatically extract the full default feature set: rows are grouped into
# individual series by "id" and ordered within each series by "time".
X_extracted = extract_features(
    timeseries,
    column_id="id",
    column_sort="time",
)
X_extracted

T_x__variance_larger_than_standard_deviation

T_x__has_duplicate_max

T_x__has_duplicate_min

T_x__has_duplicate

T_x__sum_values

T_x__abs_energy

T_x__mean_abs_change

T_x__mean_change

T_x__mean_second_derivative_central

T_x__median

...

F_z__permutation_entropy__dimension_5__tau_1

F_z__permutation_entropy__dimension_6__tau_1

F_z__permutation_entropy__dimension_7__tau_1

F_z__query_similarity_count__query_None__threshold_0.0

F_z__matrix_profile__feature_"min"__threshold_0.98

F_z__matrix_profile__feature_"max"__threshold_0.98

F_z__matrix_profile__feature_"mean"__threshold_0.98

F_z__matrix_profile__feature_"median"__threshold_0.98

F_z__matrix_profile__feature_"25"__threshold_0.98

F_z__matrix_profile__feature_"75"__threshold_0.98

1

0.0

1.0

1.0

1.0

-43.0

125.0

0.214286

0.071429

0.038462

-3.0

...

1.972247

2.163956

2.197225

NaN

NaN

NaN

NaN

NaN

NaN

NaN

2

1.0

1.0

1.0

1.0

-53.0

363.0

3.785714

-0.071429

0.153846

-3.0

...

2.397895

2.302585

2.197225

NaN

NaN

NaN

NaN

NaN

NaN

NaN

3

1.0

0.0

1.0

1.0

-60.0

344.0

3.214286

0.071429

-0.076923

-5.0

...

2.397895

2.302585

2.197225

NaN

NaN

NaN

NaN

NaN

NaN

NaN

4

1.0

1.0

0.0

1.0

-93.0

763.0

3.714286

-0.428571

-0.192308

-6.0

...

2.271869

2.302585

2.197225

NaN

NaN

NaN

NaN

NaN

NaN

NaN

5

1.0

0.0

0.0

1.0

-105.0

849.0

4.071429

-0.357143

0.000000

-8.0

...

2.271869

2.302585

2.197225

NaN

NaN

NaN

NaN

NaN

NaN

NaN

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

84

1.0

0.0

0.0

1.0

5083.0

1825597.0

18.857143

15.285714

-0.538462

394.0

...

1.366711

1.609438

1.831020

NaN

NaN

NaN

NaN

NaN

NaN

NaN

85

1.0

0.0

0.0

1.0

-511.0

18023.0

2.785714

-1.214286

0.192308

-33.0

...

1.972247

2.163956

2.197225

NaN

NaN

NaN

NaN

NaN

NaN

NaN

86

1.0

0.0

0.0

1.0

-987.0

67981.0

3.928571

-3.500000

-0.153846

-65.0

...

0.600166

0.639032

0.683739

NaN

NaN

NaN

NaN

NaN

NaN

NaN

87

1.0

0.0

0.0

1.0

-1921.0

247081.0

6.642857

-0.357143

0.461538

-126.0

...

1.366711

1.609438

1.831020

NaN

NaN

NaN

NaN

NaN

NaN

NaN

88

1.0

1.0

0.0

1.0

-304.0

6408.0

2.428571

-0.714286

0.230769

-21.0

...

2.397895

2.302585

2.197225

NaN

NaN

NaN

NaN

NaN

NaN

NaN

# Generate only a hand-picked subset of features.
# Keys are feature-calculator names; the value is None for a calculator that
# takes no parameters, or a list of parameter dicts (one feature per entry).
fc_parameters = {
    "length": None,
    "large_standard_deviation": [{"r": 0.05}, {"r": 0.1}],
}
extract_features(
    timeseries,
    column_id="id",
    column_sort="time",
    default_fc_parameters=fc_parameters,
)

F_x__length

F_x__large_standard_deviation__r_0.05

F_x__large_standard_deviation__r_0.1

F_y__length

F_y__large_standard_deviation__r_0.05

F_y__large_standard_deviation__r_0.1

F_z__length

F_z__large_standard_deviation__r_0.05

F_z__large_standard_deviation__r_0.1

T_x__length

T_x__large_standard_deviation__r_0.05

T_x__large_standard_deviation__r_0.1

T_y__length

T_y__large_standard_deviation__r_0.05

T_y__large_standard_deviation__r_0.1

T_z__length

T_z__large_standard_deviation__r_0.05

T_z__large_standard_deviation__r_0.1

1

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

0.0

0.0

2

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

3

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

4

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

5

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

...

84

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

85

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

86

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

87

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

88

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

15.0

1.0

1.0

# Automatic feature selection.
# Not every generated feature is meaningful: keep only the columns that contain
# no missing values at all, which select_features also requires of its input.
X_extracted_cols = X_extracted.columns[X_extracted.notna().all()]
X_selected = select_features(X_extracted.loc[:, X_extracted_cols], y)
# In the run shown below, the FRESH algorithm kept 665 of the 2203 NaN-free features.
print('count of raw feature: {}'.format(X_extracted_cols.size))
print('count of auto-selected feature: {}'.format(X_selected.shape[1]))
count of raw feature: 2203 
count of auto-selected feature: 665 

官方文档的这个流程展示了用 FRESH 算法进行特征选择的思路,简单来说,它是通过比较不同时间序列类别下特征取值是否存在显著性差异,来确定是否要挑选出这个特征。

除此之外,还有其他用于特征选择的方法,如recursive feature elimination (RFE),不过tsfresh包并没有内置这种方法,可以结合sklearn中的RFE方法自行组合使用。

Last updated