CSDN博客导出(C++实现)

浏览数：22 / 时间：2015年06月08日

操作系统：Windows（全系列）

编程工具：visual studio 2013

编程语言：VC++

最近博文更新得较频繁，为了防止账号异常引起CSDN博文丢失，所以花了点时间做了个小工具来导出博文，用作备份。本文将从源码分析整个实现过程。先看个截图：

操作步骤：

先在博客地址文本框输入博客地址例如：http://blog.csdn.net/yxstars/
然后点击确定，将显示共有多少篇博文，例如：[19:32:47]博文113篇
点击文章列表：将显示所有博文，格式：title，href
点击导出博文：将所有博文导出到当前目录下的Blog文件夹中，文件格式为html。
遍历博文：将遍历所有博文，每隔3s显示一篇。
停止：是终止遍历博文。

源码分析：

1. 获取对应的url页面源代码，实现如下：

// Downloads the resource at strUrl and appends its text to UrlData.
//
// strUrl  - URL passed to CInternetSession::OpenURL.
// UrlData - output; every line read is appended followed by "\n". Existing
//           content is kept, so callers that want only this page must empty
//           the string first. On a read error the lines received so far
//           remain appended.
// Returns true when the URL was opened and read to the end. Returns false,
// after reporting through ShowMes, when the URL cannot be opened or a
// CInternetException is raised while opening or reading.
bool CBlogExportDlg::GetUrlStr(CString strUrl, CString& UrlData)
{
	CInternetSession session;
	CHttpFile *file = NULL;
	bool succeeded = false;

	try{
		file = static_cast<CHttpFile*>(session.OpenURL(strUrl));
		if (file){
			CString line;
			// Reading stays inside the try block so that a failure in the
			// middle of the transfer still reaches the cleanup code below.
			while (file->ReadString(line)){
				UrlData += line + "\n";
			}
			succeeded = true;
		}
		else{
			ShowMes(strUrl + "获取失败...");
		}
	}
	catch (CInternetException *pException){
		pException->Delete();
		ShowMes("网络连接错误...");
	}

	// Single cleanup path: release the file before the session that owns it.
	if (file){
		file->Close();
		delete file;
		file = NULL;
	}
	session.Close();
	return succeeded;
}

2. 获取的html源码为utf8格式，需要转为ansi格式，C++实现代码如下：

// Converts UTF-8 text to the system ANSI code page (CP_ACP) by going through
// UTF-16.
//
// strSource - UTF-8 bytes held in a CString; leading and trailing whitespace
//             is trimmed in place before converting (the caller's string is
//             modified).
// strChAnsi - receives the converted text; it is emptied first, except when
//             strSource is empty on entry, in which case it is left untouched.
// Returns 0 on success (including empty or whitespace-only input) and 1 when
// either conversion step fails or converts fewer characters than announced.
int CBlogExportDlg::ConvUtf8ToAnsi(CString& strSource, CString& strChAnsi)
{
	if (strSource.GetLength() <= 0)
		return 0;

	strSource.TrimLeft();
	strSource.TrimRight();
	strChAnsi.Empty();

	const int utf8Len = strSource.GetLength();
	if (utf8Len <= 0)
		return 0;	// nothing left after trimming

	const LPCSTR utf8 = static_cast<LPCSTR>(strSource);

	// Step 1: UTF-8 -> UTF-16. The first call only measures the output.
	const int wideNeeded = MultiByteToWideChar(CP_UTF8, 0, utf8, utf8Len, NULL, 0);
	if (wideNeeded <= 0)
		return 1;

	// A real wide string is used as the intermediate buffer instead of
	// casting the storage of a narrow CString.
	CStringW wide;
	const int wideDone = MultiByteToWideChar(CP_UTF8, 0, utf8, utf8Len,
		wide.GetBuffer(wideNeeded), wideNeeded);
	wide.ReleaseBuffer(wideDone > 0 ? wideDone : 0);
	if (wideDone != wideNeeded)
		return 1;

	// Step 2: UTF-16 -> ANSI, again measuring first.
	const LPCWSTR wideText = static_cast<LPCWSTR>(wide);
	const int ansiNeeded = WideCharToMultiByte(CP_ACP, 0, wideText, wideDone,
		NULL, 0, NULL, NULL);
	if (ansiNeeded <= 0)
		return 1;

	const int ansiDone = WideCharToMultiByte(CP_ACP, 0, wideText, wideDone,
		strChAnsi.GetBuffer(ansiNeeded), ansiNeeded, NULL, NULL);
	strChAnsi.ReleaseBuffer(ansiDone > 0 ? ansiDone : 0);

	return (ansiDone == ansiNeeded) ? 0 : 1;
}

3. 消息文本框显示

// Appends one line to the message edit control, prefixed with the current
// time in the form "[HH:MM:SS]" and terminated with "\r\n".
//
// mes - text to display after the timestamp.
void CBlogExportDlg::ShowMes(CString mes)
{
	// Build the complete line first: "[HH:MM:SS]" + message + line break.
	const CTime now = CTime::GetCurrentTime();
	CString line = _T("[");
	line += now.Format("%H:%M:%S");
	line += "]";
	line += mes;
	line += _T("\r\n");

	// Put the caret at the end of the existing text so that ReplaceSel
	// appends instead of overwriting a selection.
	const int endPos = MesEdit.GetWindowTextLength();
	MesEdit.SetSel(endPos, endPos);
	MesEdit.ReplaceSel(line);
}

4. 点击确定按钮后，实现代码

// Handler of the OK button: reads the blog address from the address edit
// box, downloads that page, stores the raw bytes in the file "temp" under
// strDirPath, and extracts the article/page counts with GetBlogInfo.
//
// The address must contain "http://blog.csdn.net/"; otherwise an error is
// shown and nothing is downloaded.
void CBlogExportDlg::OnBnClickedButtonOk()
{
	GetDlgItemText(IDC_EDIT_ADDRESS, blogAdr);
	ShowBlogAdr();
	//blogAdr = ("http://blog.csdn.net/yxstars/");
	if (blogAdr.Find("http://blog.csdn.net/") == -1){
		ShowMes("csdn blog地址不对...");
		// Stop here: going on would download and parse an arbitrary URL.
		return;
	}
	// Remember the base address; the list handler builds page URLs from it.
	blogAdrs = blogAdr;

	CString urlData;
	if (!GetUrlStr(blogAdr, urlData)){
		return;
	}

	// Keep a copy of the page exactly as downloaded (still UTF-8).
	CFile fs;
	if (!fs.Open(strDirPath + "temp", CFile::modeCreate | CFile::modeWrite)){
		return;
	}

	fs.Write(urlData, urlData.GetLength());
	fs.Close();

	// The markers searched by GetBlogInfo are ANSI literals, so convert first.
	CString ansiUrlData;
	ConvUtf8ToAnsi(urlData, ansiUrlData);
	GetBlogInfo(ansiUrlData);

}

5. 根据博客地址，获取源代码后分析，查找博文数目，和博文列表页数。

<!--显示分页 -->

<div id="papelist" class="pagelist">
<span> 113条数据  共6页</span><strong>1</strong> <a href="/yxstars/article/list/2">2</a> <a href="/yxstars/article/list/3">3</a> <a href="/yxstars/article/list/4">4</a> <a href="/yxstars/article/list/5">5</a> <a href="/yxstars/article/list/6">...</a> <a href="/yxstars/article/list/2">下一页</a> <a href="/yxstars/article/list/6">尾页</a> 
</div>

从上面的代码中可以获取信息如下：

“113条数据  共6页”表示：共有113篇博文，博文列表共分6页。

<a href="/yxstars/article/list/3">，页面链接地址为/yxstars/article/list/ + 要显示的页数。

C++代码实现如下：

// Extracts the number of articles and the number of list pages from the
// pager block of a blog page, e.g.
//     <div id="papelist" class="pagelist">
//     <span> 113条数据  共6页</span>...
//
// urlData - page source already converted to ANSI. It is modified: on
//           success it is cut so that it begins at the article count.
// Side effects: stores the page count in blogListPage and shows the article
// count through ShowMes. On any parsing failure an error is shown and
// blogListPage is left unchanged.
void CBlogExportDlg::GetBlogInfo(CString& urlData)
{
	// Every offset below is derived from these markers rather than from
	// hard-coded byte counts, so extra whitespace or a different line ending
	// between the tags does not shift the extracted fields.
	const CString pagerTag("<div id=\"papelist\" class=\"pagelist\">");
	const CString spanTag("<span>");
	const CString countSuffix("条数据");
	const CString pagePrefix("共");
	const CString pageSuffix("页</span>");

	int pos = urlData.Find(pagerTag);
	if (pos == -1){
		ShowMes("获取列表数目失败...");
		return;
	}
	urlData = urlData.Mid(pos + pagerTag.GetLength());

	// The counts live in the first <span> after the pager tag.
	pos = urlData.Find(spanTag);
	if (pos == -1){
		ShowMes("获取列表条数失败...");
		return;
	}
	urlData = urlData.Mid(pos + spanTag.GetLength());
	urlData.TrimLeft();

	const int countEnd = urlData.Find(countSuffix);
	if (countEnd == -1){
		ShowMes("获取列表条数失败...");
		return;
	}

	CString blogListNum = urlData.Left(countEnd);
	blogListNum.TrimRight();

	// The page count sits between the prefix that follows the article count
	// and the suffix that closes the span.
	const int prefixPos = urlData.Find(pagePrefix, countEnd + countSuffix.GetLength());
	int pageEnd = -1;
	if (prefixPos != -1){
		pageEnd = urlData.Find(pageSuffix, prefixPos + pagePrefix.GetLength());
	}
	if ((prefixPos == -1) || (pageEnd == -1)){
		ShowMes("获取列表页数失败...");
		return;
	}

	const int pageStart = prefixPos + pagePrefix.GetLength();
	CString listPage = urlData.Mid(pageStart, pageEnd - pageStart);
	listPage.Trim();
	blogListPage = StrToInt(listPage);
	ShowMes("博文" + blogListNum + "篇");
}

6. 当点击显示列表时，根据之前的页面地址获取信息。

void CBlogExportDlg::OnBnClickedButtonList()
{
	clearMes();
	CString urlData, ansiUrlData, listPage;
	//http://blog.csdn.net/yxstars/article/list/1
	FileListMap.clear();
	listNum = 1;

	for (int i = 1; i < blogListPage + 1; i++){
		urlData.Empty();
		ansiUrlData.Empty();
		listPage.Format("%d", i);
		blogAdr = blogAdrs + "/article/list/" + listPage;
		ShowBlogAdr();
		if (!GetUrlStr(blogAdr, urlData)){
			return;
		}

		ConvUtf8ToAnsi(urlData, ansiUrlData);
		GetFileList(ansiUrlData);
	}

}

7. 在每个页面获取文章列表和页面地址。

    <h1>
        <span class="link_title"><a href="/yxstars/article/details/38469431">
        <font color="red">[置顶]</font>
        金融系列12《双币电子现金方案》
        </a></span>
    </h1>

从上面源码可以看出：

<span class="link_title"><a href=" 后面就是博文链接地址。

</a>前面的就是博文标题。

如果博文被置顶，标题前会多出 <font color="red">[置顶]</font> 这一部分，提取标题时需要把它去掉。

C++获取源码实现如下：

// Scans one list page for article entries of the form
//     <span class="link_title"><a href="/user/article/details/ID"> title </a>
// and, for each entry, stores title -> absolute URL in FileListMap and
// prints a numbered line through ShowMes.
//
// urlData - page source already converted to ANSI. It is consumed while
//           scanning: after every match the text before the link is dropped.
// listNum is incremented once per entry found. Scanning stops with an error
// message as soon as an entry is malformed.
void CBlogExportDlg::GetFileList(CString& urlData)
{
	const char* const titleTag = "<span class=\"link_title\">";
	// Length of the tag (25) plus the `<a href="` that follows it (9).
	const int hrefOffset = 34;

	for (int tagPos = urlData.Find(titleTag); tagPos != -1; tagPos = urlData.Find(titleTag)){
		// Cut the page so that it starts at the first character of the href.
		urlData = urlData.Mid(tagPos + hrefOffset);

		const int quotePos = urlData.Find("\"");
		if (quotePos == -1){
			ShowMes("获取列表失败...");
			return;
		}

		const int anchorEnd = urlData.Find("</a>");
		if (anchorEnd == -1){
			ShowMes("获取列表失败...");
			return;
		}

		// The title is the text between the `">` that closes the href and
		// the `</a>`.
		CString title = urlData.Mid(quotePos + 2, anchorEnd - quotePos - 2);

		// Drop any markup inside the anchor by keeping only what follows
		// the last '>'.
		const int lastTagEnd = title.ReverseFind('>');
		if (lastTagEnd != -1){
			title = title.Mid(lastTagEnd + 1);
		}
		title.Trim("\n");
		title.Trim();

		const CString href = "http://blog.csdn.net" + urlData.Left(quotePos);
		FileListMap[title] = href;

		// Fixed-width "NNN:title" column (45 characters) followed by the link.
		CString label;
		label.Format("%03d", listNum++);
		label = (label + ":" + title + "                                            ").Left(45);
		ShowMes(label + href);
	}
}

8. 当点击导出博文时，我们只需把源代码保存为html格式即可，采用多线程实现：

void CBlogExportDlg::OnBnClickedButtonExport()
{
	clearMes();
	unsigned tid;
	unsigned long thd = _beginthreadex(NULL, 0, CBlogExportDlg::WriteCycle, this, 0, &tid);
	if (thd != NULL)
	{
		CloseHandle((HANDLE)thd);
	}

}

// Thread routine that saves every article in FileListMap as
// "<strDirPath>Blog\<title>.html".
//
// p - the CBlogExportDlg that started the thread.
// Returns 0 when all articles were written; 1 when the Blog folder cannot be
// created, a download fails, or stopRun was set; 2 when an output file
// cannot be created.
unsigned __stdcall  CBlogExportDlg::WriteCycle(void* p)
{
	CBlogExportDlg* dlg = static_cast<CBlogExportDlg*>(p);
	CString blogFolderPath = dlg->strDirPath + "Blog\\";
	if (!PathIsDirectory(blogFolderPath))
	{
		if (!CreateDirectory(blogFolderPath, NULL))
		{
			dlg->ShowMes(blogFolderPath + "创建失败...");
			return 1;
		}
	}

	// Characters that Windows does not allow in a file name. Article titles
	// are free text, so each of them is replaced by '_'; replacing only the
	// two path separators would let a title such as "A: B?" abort the export.
	static const TCHAR reservedChars[] = _T("\\/:*?\"<>|");

	dlg->stopRun = false;
	CString urlData, strList;
	int iList = 1;
	CFile cf;
	std::map<CString, CString>::iterator iter;
	for (iter = dlg->FileListMap.begin(); iter != dlg->FileListMap.end(); ++iter){
		urlData.Empty();
		if (!dlg->GetUrlStr(iter->second, urlData)){
			return 1;
		}
		strList.Format("%3d", iList++);
		dlg->ShowMes("正在导出第" + strList + "篇博文：" + iter->first);

		CString blogPath(iter->first);
		for (int i = 0; reservedChars[i] != _T('\0'); ++i){
			blogPath.Replace(reservedChars[i], _T('_'));
		}
		blogPath = blogFolderPath + blogPath + ".html";
		if (!cf.Open(blogPath, CFile::modeCreate | CFile::modeWrite)){
			dlg->ShowMes("创建文件失败" + blogPath);
			return 2;
		}
		// The page is stored exactly as downloaded, without conversion.
		cf.Write(urlData, urlData.GetLength());
		cf.Close();

		// The stop flag is checked only after the current article is saved.
		if (dlg->stopRun){
			return 1;
		}

	}
	return 0;
}

9. 遍历博文时，只需依次访问之前保存的链接即可，实现如下：

void CBlogExportDlg::OnBnClickedButtonRead()
{
	clearMes();
	unsigned tid;
	unsigned long thd = _beginthreadex(NULL, 0, CBlogExportDlg::ReadCycle, this, 0, &tid);
	if (thd != NULL)
	{
		CloseHandle((HANDLE)thd);
	}
}


// Thread routine that steps through every article in FileListMap: it puts
// the article link into blogAdr, refreshes the address display, logs the
// title and then waits three seconds before the next one.
//
// p - the CBlogExportDlg that started the thread.
// Returns 0 after the last article, or 1 as soon as stopRun is found set
// after a wait.
unsigned __stdcall  CBlogExportDlg::ReadCycle(void* p)
{
	CBlogExportDlg* const self = static_cast<CBlogExportDlg*>(p);
	self->stopRun = false;

	std::map<CString, CString>::iterator entry = self->FileListMap.begin();
	while (entry != self->FileListMap.end()){
		self->blogAdr = entry->second;
		self->ShowBlogAdr();
		self->ShowMes("正在遍历博文：" + entry->first);

		Sleep(3000);
		// The stop flag is looked at only once the pause is over.
		if (self->stopRun){
			return 1;
		}
		++entry;
	}
	return 0;
}

CSDN免积分下载地址：http://download.csdn.net/detail/yxstars/7786309

文/闫鑫原创 转载请注明出处http://blog.csdn.net/yxstars/article/details/38686487