A C++ Implementation of Logistic Regression
Logistic regression is a classic classification method in statistical learning; it belongs to the family of log-linear models. This post presents a C++ implementation; for the underlying theory, please consult a textbook or search online.
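For reference, here is the model the code fits (a standard summary of binomial logistic regression, added for completeness):

$$P(Y=1\mid x)=\frac{\exp(w\cdot x+b)}{1+\exp(w\cdot x+b)},\qquad P(Y=0\mid x)=\frac{1}{1+\exp(w\cdot x+b)}$$

Training chooses $(w,b)$ to maximize the log-likelihood of the training set $\{(x_j,y_j)\}_{j=1}^N$ with $y_j\in\{0,1\}$:

$$L(w,b)=\sum_{j=1}^{N}\Big[\,y_j\,(w\cdot x_j+b)-\log\big(1+\exp(w\cdot x_j+b)\big)\Big]$$

This $L(w,b)$ is exactly what the `Lw` function in the code computes.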
The dataset used here comes from a medical website; I no longer remember the exact source (my apologies). The data has the following format:
10009 1 0 0 1 0 1
10025 0 0 1 2 0 0
20035 0 0 1 0 0 1
20053 1 0 0 0 0 0
30627 1 0 1 2 0 0
30648 2 0 0 0 1 0
Each row has 7 columns. The first column is an ID, which the program ignores. The next 5 columns are feature values, and the last column is the class label (0 or 1). For example, the row 10009 1 0 0 1 0 1 is instance 10009 with features (1, 0, 0, 1, 0) and label 1.
For the implementation, the data is split into a training set and a test set, each saved to its own file, organized as follows:
testdata.txt holds the test data, traindata.txt holds the training data, and logistic.cpp is the source file. All three files live in the same directory.
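In other words, the working directory (any name will do) contains the three files side by side:

logistic/
├── logistic.cpp
├── traindata.txt
└── testdata.txt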
The implementation is as follows:
/********* Logistic regression (C++) by 姜富春 **********/
#include<iostream>
#include<fstream>
#include<vector>
#include<sstream>
#include<cmath>
#include<cstdlib>
using namespace std;

struct Data {
    vector<int> features;
    int cls;
    Data(vector<int> f, int c) : features(f), cls(c) {}
};

struct Param {
    vector<double> w;  // feature weights
    double d;          // bias term
    Param(vector<double> w1, double d1) : w(w1), d(d1) {}
    Param() : w(vector<double>()), d(0.0) {}
};

class Logistic {
public:
    Logistic() {
        loadDataSet(dataSet);  // build dataSet from the traindata file
        // One zero-initialized weight per feature; the bias also starts at 0.0.
        param = Param(vector<double>(dataSet[0].features.size(), 0.0), 0.0);
    }
    void loadDataSet(vector<Data>& ds, string dataFile = "./traindata.txt") {
        ifstream fin(dataFile.c_str());
        if (!fin) { cout << "failed to open " << dataFile << endl; exit(1); }
        string line;
        while (getline(fin, line)) {
            if (line.size() <= 3) continue;  // skip blank or truncated lines
            stringstream sin(line);
            int t;
            sin >> t;  // the first column is an ID: read it and discard it
            vector<int> fea;
            while (sin >> t) fea.push_back(t);
            int cl = fea.back();  // the last column is the class label
            fea.pop_back();
            ds.push_back(Data(fea, cl));
        }
    }
    void displayDataSet() {
        for (int i = 0; i < dataSet.size(); i++) {
            for (int j = 0; j < dataSet[i].features.size(); j++) cout << dataSet[i].features[j] << " ";
            cout << " class: " << dataSet[i].cls << endl;
        }
    }
    void logisticRegression() {
        // The objective is the log-likelihood and we want its maximum,
        // so the iteration is gradient ascent, not gradient descent.
        double lamda = 0.1;        // gradient-ascent step size
        double delta = 0.0001;     // convergence threshold
        double objLw = Lw(param);  // current objective value
        Param tpa(param.w, param.d);
        gradient(lamda);
        double newObjLw = Lw(param);
        int iter = 0;
        cout << "initial:" << endl;
        displayIterProcess(iter, objLw, newObjLw, 1);
        // Iterate until both the objective and the parameters stop moving.
        while (fabs(newObjLw - objLw) > delta || !samewb(tpa, param, delta)) {
            objLw = newObjLw;
            tpa = Param(param.w, param.d);
            gradient(lamda);
            newObjLw = Lw(param);
            ++iter;
            displayIterProcess(iter, objLw, newObjLw, 5);
        }
        cout << "finished after " << iter << " iterations" << endl;
        displayIterProcess(iter, objLw, newObjLw, 1);
    }
    bool samewb(const Param& tparam, const Param& param, double delta) {
        for (int i = 0; i < tparam.w.size(); i++)
            if (fabs(tparam.w[i] - param.w[i]) > delta) return false;
        return fabs(tparam.d - param.d) <= delta;
    }
    void displayIterProcess(int iter, double objLw, double newObjLw, int mod) {
        if (iter % mod != 0) return;  // print every mod iterations
        cout << "iteration " << iter << ": objective [" << newObjLw << "], change [" << (newObjLw - objLw) << "]" << endl;
        cout << "model parameters: ";
        for (int i = 0; i < param.w.size(); i++) cout << param.w[i] << " ";
        cout << param.d << endl << endl;
    }
    // Batch gradient-ascent update of w and b.
    void gradient(double lam) {
        for (int i = 0; i < param.w.size(); i++) {
            double tmp = 0.0;  // accumulates lam * dL/dw_i
            for (int j = 0; j < dataSet.size(); j++)
                tmp += (dataSet[j].cls - logiFun(param, dataSet[j])) * dataSet[j].features[i] * lam;
            param.w[i] += tmp;
        }
        double tmp = 0.0;      // accumulates lam * dL/db
        for (int j = 0; j < dataSet.size(); j++)
            tmp += (dataSet[j].cls - logiFun(param, dataSet[j])) * lam;
        param.d += tmp;
    }
    // Logistic function f(x) = exp(w.x)/(1+exp(w.x)); it appears in the
    // gradient, so this helper supports the gradient-ascent computation.
    inline double logiFun(const Param& p, const Data& d) {
        double inner = innerWX(p, d);
        return exp(inner) / (1 + exp(inner));
    }
    // Value of the log-likelihood.
    double Lw(const Param& p) {
        double l = 0.0;
        for (int i = 0; i < dataSet.size(); i++) {
            double inner = innerWX(p, dataSet[i]);
            // Natural log, consistent with the gradient (the original used log10 here).
            l += dataSet[i].cls * inner - log(1 + exp(inner));
        }
        return l;
    }
    // Computes w.x + b.
    inline double innerWX(const Param& p, const Data& data) {
        if (p.w.size() != data.features.size()) {
            cout << "dimension mismatch between parameters and instance" << endl;
            exit(1);
        }
        double innerP = 0.0;
        for (int i = 0; i < p.w.size(); i++) innerP += p.w[i] * data.features[i];
        return innerP + p.d;
    }
    // Predict a class for each test instance: compute P(Y=1|x)=exp(w.x)/(1+exp(w.x))
    // and P(Y=0|x)=1/(1+exp(w.x)), then report whichever is larger.
    void predictClass() {
        vector<Data> testDataSet;
        loadDataSet(testDataSet, "./testdata.txt");
        cout << endl << "predictions:" << endl;
        for (int i = 0; i < testDataSet.size(); i++) {
            double inner = innerWX(param, testDataSet[i]);
            double py1 = exp(inner) / (1 + exp(inner));
            double py0 = 1 - py1;
            cout << "instance: ";
            for (int j = 0; j < testDataSet[i].features.size(); j++) cout << testDataSet[i].features[j] << " ";
            cout << "label [" << testDataSet[i].cls << "], predicted [" << (py1 >= py0 ? 1 : 0) << "]" << endl;
        }
    }
private:
    vector<Data> dataSet;
    Param param;
};

int main() {
    Logistic logist;
    //logist.displayDataSet();
    logist.logisticRegression();
    logist.predictClass();
    system("pause");  // Windows-only; remove on other platforms
    return 0;
}
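The update performed by gradient() is batch gradient ascent on the log-likelihood above. Differentiating $L(w,b)$ gives, with $\sigma(t)=\exp(t)/(1+\exp(t))$ (the logiFun helper):

$$\frac{\partial L}{\partial w_i}=\sum_{j}\big(y_j-\sigma(w\cdot x_j+b)\big)\,x_{j,i},\qquad\frac{\partial L}{\partial b}=\sum_{j}\big(y_j-\sigma(w\cdot x_j+b)\big)$$

Each iteration therefore sets $w_i\leftarrow w_i+\lambda\,\partial L/\partial w_i$ and $b\leftarrow b+\lambda\,\partial L/\partial b$, with step size $\lambda=0.1$ (lamda in the code). To build and run (assuming a g++ toolchain; the original build environment is not stated):

g++ logistic.cpp -o logistic
./logistic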
The program produces two pieces of output:
1. The iterative training trace.
2. The predicted classes for the test set.
The classification accuracy in this example is not very good. When I saw these results while debugging, I reviewed the code carefully and found no errors (readers are welcome to double-check). Possible causes (a quick way to quantify the accuracy is sketched after this list):
(1) The dataset is too small.
(2) The features do not carry enough information.
(3) A linear model cannot separate this data well.
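To make "not very good" concrete, one could report the fraction of test instances classified correctly. Below is a minimal sketch of such a helper, my addition rather than part of the original program; it is meant to be added as another member of the Logistic class, since it reuses innerWX, Param, and Data (note that P(Y=1|x) >= P(Y=0|x) is equivalent to P(Y=1|x) >= 0.5):

// Hypothetical addition to the Logistic class (not in the original code):
// returns the fraction of instances in ds whose predicted class matches
// the label stored in the file.
double accuracy(const vector<Data>& ds) {
    if (ds.empty()) return 0.0;
    int correct = 0;
    for (int i = 0; i < ds.size(); i++) {
        double inner = innerWX(param, ds[i]);                  // w.x + b
        int pred = (exp(inner) / (1 + exp(inner)) >= 0.5) ? 1 : 0;
        if (pred == ds[i].cls) ++correct;
    }
    return double(correct) / ds.size();
}

Calling cout << "accuracy: " << accuracy(testDataSet) << endl; at the end of predictClass() would then print a single summary number alongside the per-instance output.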
Attachments:
Training data: traindata.txt
10009 1 0 0 1 0 1
10025 0 0 1 2 0 0
10038 1 0 0 1 1 0
10042 0 0 0 0 1 0
10049 0 0 1 0 0 0
10113 0 0 1 0 1 0
10131 0 0 1 2 1 0
10160 1 0 0 0 0 0
10164 0 0 1 0 1 0
10189 1 0 1 0 0 0
10215 0 0 1 0 1 0
10216 0 0 1 0 0 0
10235 0 0 1 0 1 0
10270 1 0 0 1 0 0
10282 1 0 0 0 1 0
10303 2 0 0 0 1 0
10346 1 0 0 2 1 0
10380 2 0 0 0 1 0
10429 2 0 1 0 0 0
10441 0 0 1 0 1 0
10443 0 0 1 2 0 0
10463 0 0 0 0 0 0
10475 0 0 1 0 1 0
10489 1 0 1 0 1 1
10518 0 0 1 2 1 0
10529 1 0 1 0 0 0
10545 0 0 1 0 0 0
10546 0 0 0 2 0 0
10575 1 0 0 0 1 0
10579 2 0 1 0 0 0
10581 2 0 1 1 1 0
10600 1 0 1 1 0 0
10627 1 0 1 2 0 0
10653 1 0 0 1 1 0
10664 0 0 0 0 1 0
10691 1 1 0 0 1 0
10692 1 0 1 2 1 0
10711 0 0 0 0 1 0
10714 0 0 1 0 0 0
10739 1 0 1 1 1 0
10750 1 0 1 0 1 0
10764 2 0 1 2 0 0
10770 0 0 1 2 1 0
10780 0 0 1 0 1 0
10784 2 0 1 0 1 0
10785 0 0 1 0 1 0
10788 1 0 0 0 0 0
10815 1 0 0 0 1 0
10816 0 0 0 0 1 0
10818 0 0 1 2 1 0
11095 0 1 1 0 0 0
11146 0 1 0 0 1 0
11206 2 1 0 0 0 0
11223 2 1 0 0 0 0
11236 1 1 0 2 0 0
11244 1 1 0 0 0 1
11245 0 1 0 0 0 0
11278 2 1 0 0 1 0
11322 0 1 0 0 1 0
11326 2 1 0 2 1 0
11329 2 1 0 2 1 0
11344 1 1 0 2 1 0
11358 0 1 0 0 0 1
11417 2 1 1 0 1 0
11421 2 1 0 1 1 0
11484 1 1 0 0 0 1
11499 2 1 0 0 0 0
11503 1 1 0 0 1 0
11527 1 1 0 0 0 0
11540 2 1 0 1 1 0
11580 1 1 0 0 1 0
11583 1 0 1 1 0 1
11592 2 1 0 1 1 0
11604 0 1 0 0 1 0
11625 1 0 1 0 0 0
20035 0 0 1 0 0 1
20053 1 0 0 0 0 0
20070 0 0 0 2 1 0
20074 1 0 1 2 0 1
20146 1 0 0 1 1 0
20149 2 0 1 2 1 0
20158 2 0 0 0 1 0
20185 1 0 0 1 1 0
20193 1 0 1 0 1 0
20194 0 0 1 0 0 0
20205 1 0 0 2 1 0
20206 2 0 1 1 1 0
20265 0 0 1 0 1 0
20311 0 0 0 0 1 0
20328 2 0 0 1 0 1
20353 0 0 1 0 0 0
20372 0 0 0 0 0 0
20405 1 0 1 1 1 1
20413 2 0 1 0 1 0
20427 0 0 0 0 0 0
20455 1 0 1 0 1 0
20462 0 0 0 0 1 0
20472 0 0 0 2 0 0
20485 0 0 0 0 0 0
20523 0 0 1 2 0 0
20539 0 0 1 0 1 0
20554 0 0 1 0 0 1
20565 0 0 0 2 1 0
20566 1 0 1 1 1 0
20567 1 0 0 1 1 0
20568 0 0 1 0 1 0
20569 1 0 0 0 0 0
20571 1 0 1 0 1 0
20581 2 0 0 0 1 0
20583 1 0 0 0 1 0
20585 2 0 0 1 1 0
20586 0 0 1 2 1 0
20591 1 0 1 2 0 0
20595 0 0 1 2 1 0
20597 1 0 0 0 0 0
20599 0 0 1 0 1 0
20607 0 0 0 1 1 0
20611 1 0 0 0 1 0
20612 2 0 0 1 1 0
20614 1 0 0 1 1 0
20615 1 0 1 0 0 0
21017 1 1 0 1 1 0
21058 2 1 0 0 1 0
21063 0 1 0 0 0 0
21084 1 1 0 1 0 1
21087 1 1 0 2 1 0
21098 0 1 0 0 0 0
21099 1 1 0 2 0 0
21113 0 1 0 0 1 0
21114 1 1 0 0 1 1
21116 1 1 0 2 1 0
21117 1 0 0 2 1 0
21138 2 1 1 1 1 0
21154 0 1 0 0 1 0
21165 0 1 0 0 1 0
21181 2 1 0 0 0 1
21183 1 1 0 2 1 0
21231 1 1 0 0 1 0
21234 1 1 1 0 0 0
21286 2 1 0 2 1 0
21352 2 1 1 1 0 0
21395 0 1 0 0 1 0
21417 1 1 0 2 1 0
21423 0 1 0 0 1 0
21426 1 1 0 1 1 0
21433 0 1 0 0 1 0
21435 0 1 0 0 0 0
21436 1 1 0 0 0 0
21439 1 1 0 2 1 0
21446 1 1 0 0 0 0
21448 0 1 1 2 0 0
21453 2 1 0 0 1 0
30042 2 0 1 0 0 1
30080 0 0 1 0 1 0
301003 1 0 1 0 0 0
301009 0 0 1 2 1 0
301017 0 0 1 0 0 0
30154 1 0 1 0 1 0
30176 0 0 1 0 1 0
30210 0 0 1 0 1 0
30239 1 0 1 0 1 0
30311 0 0 0 0 0 1
30382 0 0 1 2 1 0
30387 0 0 1 0 1 0
30415 0 0 1 0 1 0
30428 0 0 1 0 0 0
30479 0 0 1 0 0 1
30485 0 0 1 2 1 0
30493 2 0 1 2 1 0
30519 0 0 1 0 1 0
30532 0 0 1 0 1 0
30541 0 0 1 0 1 0
30567 1 0 0 0 0 0
30569 2 0 1 1 1 0
30578 0 0 1 0 0 1
30579 1 0 1 0 0 0
30596 1 0 1 1 1 0
30597 1 0 1 1 0 0
30618 0 0 1 0 0 0
30622 1 0 1 1 1 0
30627 1 0 1 2 0 0
30648 2 0 0 0 1 0
30655 0 0 1 0 0 1
30658 0 0 1 0 1 0
30667 0 0 1 0 1 0
30678 1 0 1 0 0 0
30701 0 0 1 0 0 0
30703 2 0 1 1 0 0
30710 0 0 1 2 0 0
30713 1 0 0 1 1 1
30716 0 0 0 0 1 0
30721 0 0 0 0 0 1
30723 0 0 1 0 1 0
30724 2 0 1 2 1 0
30733 1 0 0 1 0 0
30734 0 0 1 0 0 0
30736 2 0 0 1 1 1
30737 0 0 1 0 0 0
30740 0 0 1 0 1 0
30742 2 0 1 0 1 0
30743 0 0 1 0 1 0
30745 2 0 0 0 1 0
30754 1 0 1 0 1 0
30758 1 0 0 0 1 0
30764 0 0 1 0 0 1
30765 2 0 0 0 0 0
30769 2 0 0 1 1 0
30772 0 0 1 0 1 0
30774 0 0 0 0 1 0
30784 2 0 1 0 0 0
30786 1 0 1 0 1 0
30787 0 0 0 0 1 0
30789 1 0 1 0 1 0
30800 0 0 1 0 0 0
30801 1 0 1 0 1 0
30803 1 0 1 0 1 0
30806 1 0 1 0 1 0
30817 0 0 1 2 0 0
30819 2 0 1 0 1 1
30822 0 0 1 0 1 0
30823 0 0 1 2 1 0
30834 0 0 0 0 0 0
30836 0 0 1 0 1 0
30837 1 0 1 0 1 0
30840 0 0 1 0 1 0
30841 1 0 1 0 0 0
30844 0 0 1 0 1 0
30845 0 0 1 0 0 0
30847 1 0 1 0 0 0
30848 0 0 1 0 1 0
30850 0 0 1 0 1 0
30856 1 0 0 0 1 0
30858 0 0 1 0 0 0
30860 0 0 0 0 1 0
30862 1 0 1 1 1 0
30864 0 0 0 2 0 0
30867 0 0 1 0 1 0
30869 0 0 1 0 1 0
30887 0 0 1 0 1 0
30900 1 0 0 1 1 0
30913 2 0 0 0 1 0
30914 1 0 0 0 0 0
30922 2 0 0 2 1 0
30923 0 0 1 2 1 0
30927 1 0 1 0 0 1
30929 0 0 1 2 1 0
30933 0 0 1 2 1 0
30940 0 0 1 0 1 0
30943 1 0 1 2 1 0
30945 0 0 0 2 0 0
30951 1 0 0 0 0 0
30964 0 0 0 2 1 0
30969 0 0 1 0 1 0
30979 2 0 0 0 1 0
30980 1 0 0 0 0 0
30982 1 0 0 1 1 0
30990 1 0 1 1 1 0
30991 1 0 1 0 1 1
30999 0 0 1 0 1 0
31056 1 1 0 2 1 0
31068 1 1 0 1 0 0
31108 2 1 0 2 1 0
31168 1 1 1 0 0 0
31191 0 1 1 0 0 0
31229 0 1 1 0 0 1
31263 0 1 0 0 1 0
31281 1 1 1 0 0 0
31340 1 1 1 0 1 0
31375 0 1 0 0 1 0
31401 0 1 1 0 0 1
31480 1 1 1 1 1 0
31501 1 1 0 2 1 0
31514 0 1 0 2 0 0
31518 1 1 0 2 1 0
31532 0 0 1 2 1 0
31543 2 1 1 1 1 0
31588 0 1 0 0 1 0
31590 0 0 1 0 1 0
31591 2 1 0 1 1 0
31595 0 1 0 0 1 0
31596 1 1 0 0 0 0
31598 1 1 0 0 1 0
31599 0 1 0 0 0 0
31605 0 1 1 0 0 0
31612 2 1 0 0 1 0
31615 2 1 0 0 0 0
31628 1 1 0 0 1 0
31640 2 1 0 1 1 0
Test data: testdata.txt
10009 1 0 0 1 0 1
10025 0 0 1 2 0 0
20035 0 0 1 0 0 1
20053 1 0 0 0 0 0
30627 1 0 1 2 0 0
30648 2 0 0 0 1 0