下面是一个可用于特征选择的简单 GA 代码。它使用了功能强大的 deap 库,但学习曲线可能比较陡峭。不过,这个简单的版本应该足够清晰。
上下滑动查看更多源码
1.
# to maximize the objective# fitness_weights = 1.0# to minimize the objective
fitness_weights =-1.0# copy the original dataframes into local copies, once
X_ga = X.copy()
y_ga = y.copy()# 'const' (the first column) is not an actual feature, do not include it
X_features = X_ga.columns.to_list()[1:]
try:
del creator.FitnessMax
del creator.Individual
except Exception as e:
pass
creator.create("FitnessMax", base.Fitness, weights=(fitness_weights,))
creator.create("Individual", array.array, typecode='b', fitness=creator.FitnessMax
)
try:
del toolbox
except Exception as e:
pass
toolbox = base.Toolbox()# Attribute generator
toolbox.register("attr_bool", random.randint,0,1)# Structure initializers
toolbox.register("individual",
tools.initRepeat,
creator.Individual,
toolbox.attr_bool,len(X_features),)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
def evalOneMax(individual):
# objective function# create True/False selector list for features# and add True at the start for 'const'
cols_select =[True]+[i ==1for i in list(individual)]# fit model using the features selected from the individual
lin_mod = sm.OLS(y_ga, X_ga.loc[:, cols_select], hascnotallow=True).fit()return(lin_mod.bic,)
toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
random.seed(0)
pop = toolbox.population(n=300)
hof = tools.HallOfFame(1)
pop, log = algorithms.eaSimple(
pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=10, halloffame=hof, verbose=True)
best_individual_ga_small = list(hof[0])
best_features_ga_small =[
X_features[i]for i, val in enumerate(best_individual_ga_small)if val ==1]
best_objective_ga_small =(
sm.OLS(y_ga, X_ga[['const']+ best_features_ga_small], hascnotallow=True).fit().bic
)print(f'best objective: {best_objective_ga_small}')print(f'best features: {best_features_ga_small}')
这个简单的代码很容易理解,但效率很低。下面有更复杂版本的 GA 代码,更复杂、更优化的代码 1000 代后。
# to maximize the objective# fitness_weights = 1.0# to minimize the objective
fitness_weights =-1.0# copy the original dataframes into local copies, once
X_ga = X.astype(float).copy()
y_ga = y.astype(float).copy()# 'const' (the first column) is not an actual feature, do not include it
X_features = X_ga.columns.to_list()[1:]
try:
del creator.FitnessMax
del creator.Individual
except Exception as e:
pass
creator.create("FitnessMax", base.Fitness, weights=(fitness_weights,))
creator.create("Individual", array.array, typecode='b', fitness=creator.FitnessMax
)
try:
del toolbox
except Exception as e:
pass
toolbox = base.Toolbox()# Attribute generator
toolbox.register("attr_bool", random.randint,0,1)# Structure initializers
toolbox.register("individual",
tools.initRepeat,
creator.Individual,
toolbox.attr_bool,len(X_features),)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
def evalMany(individuals):
# individuals is an array of shape (n_individuals, n_features)# transform it into a list of lists:# - a list of individuals# - each individual is a list of feature selectors
ind_list =[list(i)for i in list(individuals)]
ret =[]for ind in ind_list:
# create a list of True/False feature selectors from each individual# pre-pend True to always select the 'const' feature
cols_select =[True]+[i ==1for i in list(ind)]# fit model using the features selected from the individual
lin_mod = sm.OLS(y_ga, X_ga.loc[:, cols_select], hascnotallow=True).fit()
ret.append((lin_mod.bic,))return ret
# multiprocess pool to evaluate individuals
def joblib_map(f, njobs,*iters):
return Parallel(n_jobs=njobs)(delayed(f)(*args)for args in zip(*iters))
def selElitistAndTournament(
individuals, k_tournament, k_elitist=0, tournsize=3):
# elitist tournament# in addition to the regular tournament, ensure the top #k_elistist individuals are preservedreturn tools.selBest(individuals, k_elitist)+ tools.selTournament(
individuals, k_tournament, tournsize
)# Hyperparameters
population_size =1000
crossover_probability =0.5
individual_mutation_probability =0.2
gene_mutation_probability =0.1
tournament_size =3
elite_size =0
n_jobs = os.cpu_count()# register the pool as the mapper# and the custom function as the evaluator
toolbox.register("map_multi", joblib_map)
toolbox.register("evaluate", evalMany)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=gene_mutation_probability)# toolbox.register("select", tools.selTournament, tournsize=tournament_size)# selection with tournament and optional elitism
toolbox.register("select",
selElitistAndTournament,
k_tournament=population_size - elite_size,
k_elitist=elite_size,
tournsize=tournament_size,)
random.seed(0)
population = toolbox.population(n=population_size)
hall_of_fame = tools.HallOfFame(1)
objective_runs_ga =0# Evaluate the individuals with an invalid fitness
invalid_ind =[ind for ind in population ifnot ind.fitness.valid]# split the population in a list with n_jobs elements# each element is an array containing multiple individuals# send individuals to the evaluator
fitnesses_nested = toolbox.map_multi(
toolbox.evaluate, n_jobs, np.array_split(invalid_ind, n_jobs))
objective_runs_ga +=len(invalid_ind)
fitnesses =[]for l in fitnesses_nested:
fitnesses += l
for ind, fit in zip(invalid_ind, fitnesses):
ind.fitness.values= fit
hall_of_fame.update(population)
n_gen =1000
best_objective_per_gen_ga = np.full(n_gen, np.nan)
best_objective_ga = np.nan
best_generation_ga =0
gene_values_mean = np.zeros((n_gen,len(X_features)))
gene_maes = np.full(n_gen, np.nan)
time_to_best_ga = np.inf
# Begin the generational process
iterator = tqdm(range(1, n_gen +1),desc='generation')
t_start =time.time()for gen in iterator:
t_start_loop =time.time()# Select the next generation of individuals
offspring = toolbox.select(population)# Vary the pool of individuals via cross-over and mutation
offspring = algorithms.varAnd(
offspring,
toolbox,
crossover_probability,
individual_mutation_probability,)# Evaluate the individuals with an invalid fitness
invalid_ind =[ind for ind in offspring ifnot ind.fitness.valid]# split the population in a list with n_jobs elements# each element is an array containing multiple individuals# send list to the evaluator pool
fitnesses_nested = toolbox.map_multi(
toolbox.evaluate, n_jobs, np.array_split(invalid_ind, n_jobs))
objective_runs_ga +=len(invalid_ind)
fitnesses =[]for l in fitnesses_nested:
fitnesses += l
for ind, fit in zip(invalid_ind, fitnesses):
ind.fitness.values= fit
# Update the hall of fame with the generated individuals
hall_of_fame.update(offspring)# Replace the current population by the offspring
population[:]= offspring
t_end_loop =time.time()# record the mean gene values across the population
gene_values_mean[gen -1, :]= np.array(population).mean(axis=0)if gen >=2:
gene_maes[gen -1]= mean_absolute_error(
gene_values_mean[gen -2, :], gene_values_mean[gen -1, :])# pick best individual for stats recording
best_individual_ga = tools.selBest(population,1)[0]
best_objective_per_gen_ga[gen -1]= best_individual_ga.fitness.values[0]if(
best_objective_ga is np.nan
or fitness_weights * best_individual_ga.fitness.values[0]> fitness_weights * best_objective_ga
):
best_objective_ga = best_individual_ga.fitness.values[0]
best_generation_ga = gen
time_to_best_ga = t_end_loop - t_start
print(
f'gen: {gen:4n}, curr/prev gene MAE: {gene_maes[gen - 1]:.4f}, new best objective: {best_objective_ga:.4f}, time to best: {time_to_best_ga:.4f}')if os.path.isfile('break'):
# to gracefully break the loop, manually create a file called 'break'print(f'Found break file, stopping now.')
iterator.close()break
g_completed_ga = gen
best_individual_ga = list(hall_of_fame[0])
best_features_ga =[
X_features[i]for i, val in enumerate(best_individual_ga)if val ==1]
best_objective_ga =(
sm.OLS(y_ga, X_ga[['const']+ best_features_ga], hascnotallow=True).fit().bic
)## 强烈建议关注#公众号:数据STUDIO 更多优质内容每日更新!
1.
2.
3.
4.
5.
6.
7.
8.
9.
10.
11.
12.
13.
14.
15.
16.
17.
18.
19.
20.
21.
22.
23.
24.
25.
26.
27.
28.
29.
30.
31.
32.
33.
34.
35.
36.
37.
38.
39.
40.
41.
42.
43.
44.
45.
46.
47.
48.
49.
50.
51.
52.
53.
54.
55.
56.
57.
58.
59.
60.
61.
62.
63.
64.
65.
66.
67.
68.
69.
70.
71.
72.
73.
74.
75.
76.
77.
78.
79.
80.
81.
82.
83.
84.
85.
86.
87.
88.
89.
90.
91.
92.
93.
94.
95.
96.
97.
98.
99.
100.
101.
102.
103.
104.
105.
106.
107.
108.
109.
110.
111.
112.
113.
114.
115.
116.
117.
118.
119.
120.
121.
122.
123.
124.
125.
126.
127.
128.
129.
130.
131.
132.
133.
134.
135.
136.
137.
138.
139.
140.
141.
142.
143.
144.
145.
146.
147.
148.
149.
150.
151.
152.
153.
154.
155.
156.
157.
158.
159.
160.
161.
162.
163.
164.
165.
166.
167.
168.
169.
170.
171.
172.
173.
174.
175.
176.
177.
178.
179.
180.
181.
182.
183.
184.
185.
186.
187.
188.
189.
190.
191.
192.
193.
194.
195.
196.
197.
198.
199.
200.
201.
202.
203.
best objective: 33705.569572544795
best generation: 787
objective runs: 600525timeto best: 158.027 sec