Since the original algorithm is hard to parallelize, we turn to one of its variants, Pegasos (Primal Estimated sub-GrAdient SOlver for SVM), which lends itself well to parallelization. The algorithm proceeds as follows:
    initialize w = 0 (as a vector)
    for i in 1..T:
        randomly select k samples
        for j in the k samples:
            if sample j is misclassified:
                compute this sample's update to the weights
        accumulate the updates and apply them to w
    end for
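Concretely, at iteration t Pegasos uses the step size η_t = 1/(λ·t) and, over the k-sample batch A_t, applies

    w ← (1 − η_t·λ)·w + (η_t / k) · Σ y·x,  summed over (x, y) in A_t with y·⟨w, x⟩ < 1

Below is a minimal serial sketch of that update in NumPy; the regularization constant lam, batch size k, and iteration count T are illustrative defaults, not values fixed by the text:

    import numpy as np

    def pegasos_batch(X, y, lam=0.01, k=64, T=1000, seed=0):
        """Mini-batch Pegasos for a linear SVM: X is (n, d), y in {-1, +1}."""
        rng = np.random.default_rng(seed)
        n, d = X.shape
        w = np.zeros(d)
        for t in range(1, T + 1):
            batch = rng.choice(n, size=min(k, n), replace=False)
            eta = 1.0 / (lam * t)                      # decaying step size
            # batch members that violate the margin: y * <w, x> < 1
            viol = y[batch] * (X[batch] @ w) < 1.0
            grad = (y[batch][viol, None] * X[batch][viol]).sum(axis=0)
            w = (1.0 - eta * lam) * w + (eta / k) * grad
        return w

The (1 − η_t·λ) factor shrinks w toward zero (the regularization term), while the averaged sum over margin violators pulls w toward classifying them correctly; the reducer in the MapReduce version below applies the same update in aggregate.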
Below is a MapReduce version of Pegasos built on mrjob:
    import pickle
    from mrjob.job import MRJob
    from numpy import mat, shape, zeros, random

    class MRsvm(MRJob):
        DEFAULT_INPUT_PROTOCOL = 'json_value'

        # set up parameters; the training data is assumed to be a pickled
        # NumPy matrix with the label in the last column, loaded on every node
        # ('data_path' is a placeholder for the actual file)
        def __init__(self, *args, **kwargs):
            super(MRsvm, self).__init__(*args, **kwargs)
            self.data = pickle.load(open('data_path', 'rb'))
            self.w = 0                 # weight vector, initialized lazily in map_fin
            self.eta = 0.69            # learning rate (recomputed each iteration in reduce)
            self.dataList = []         # collects the sample indices assigned to this node
            self.k = self.options.batchsize
            self.numMappers = 1
            self.t = 1                 # current iteration number

        def configure_options(self):
            # declare the --iterations and --batchsize command-line options
            # that the code below reads from self.options
            super(MRsvm, self).configure_options()
            self.add_passthrough_option(
                '--iterations', dest='iterations', default=2, type='int',
                help='T: number of iterations to run')
            self.add_passthrough_option(
                '--batchsize', dest='batchsize', default=100, type='int',
                help='k: number of samples per batch')

        def map(self, mapperId, inVals):
            # <key, value> is <mapper id, w vector | sample index | iteration number>
            if False: yield            # keeps map a generator even though it emits nothing
            # dispatch on whether the value carries w, a sample index, or t
            if inVals[0] == 'w':
                self.w = inVals[1]
            elif inVals[0] == 'x':
                self.dataList.append(inVals[1])
            elif inVals[0] == 't':
                self.t = inVals[1]

        def map_fin(self):
            labels = self.data[:, -1]; X = self.data[:, 0:-1]  # split features and labels
            if self.w == 0: self.w = [0.001] * shape(X)[1]     # initialize w on the first pass
            for index in self.dataList:
                p = mat(self.w) * X[index, :].T                # classify the sample
                if labels[index] * p < 1.0:                    # margin violated
                    yield (1, ['u', index])                    # record the misclassified sample's index
            yield (1, ['w', self.w])                           # emit this worker's w
            yield (1, ['t', self.t])                           # emit the iteration number

        def reduce(self, _, packedVals):
            # unpack the misclassified sample indices, w, and the iteration number
            for valArr in packedVals:
                if valArr[0] == 'u': self.dataList.append(valArr[1])
                elif valArr[0] == 'w': self.w = valArr[1]
                elif valArr[0] == 't': self.t = valArr[1]
            labels = self.data[:, -1]; X = self.data[:, 0:-1]
            wMat = mat(self.w); wDelta = mat(zeros(len(self.w)))
            for index in self.dataList:
                wDelta += float(labels[index]) * X[index, :]   # accumulate the update to w
            eta = 1.0 / (2.0 * self.t)                         # decay the learning rate
            # apply the accumulated update to w
            wMat = (1.0 - 1.0 / self.t) * wMat + (eta / self.k) * wDelta
            for mapperNum in range(1, self.numMappers + 1):
                yield (mapperNum, ['w', wMat.tolist()[0]])
                if self.t < self.options.iterations:
                    yield (mapperNum, ['t', self.t + 1])
                    # hand each mapper a fresh batch of random sample indices
                    for j in range(self.k // self.numMappers):
                        yield (mapperNum, ['x', int(random.randint(shape(self.data)[0]))])

        def steps(self):
            # run the same map -> map_final -> reduce step once per iteration
            return ([self.mr(mapper=self.map, reducer=self.reduce,
                             mapper_final=self.map_fin)] * self.options.iterations)
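Note that this listing targets the older mrjob 0.3-style API (DEFAULT_INPUT_PROTOCOL, add_passthrough_option, self.mr); on a current mrjob release the rough equivalents are INPUT_PROTOCOL = JSONValueProtocol and MRStep. A local run might look like the following, where mrSVM.py is whatever file holds the class and seed_input.txt is a hypothetical seed file whose lines are JSON values such as ["x", 7] (the sample indices for the first batch):

    $ python mrSVM.py --iterations=5 --batchsize=100 < seed_input.txt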