httpClient如何接收格式错误的响应头部信息

Exception in thread "main" org.apache.commons.httpclient.ProtocolException: Unable to parse header: share memory not exist, need create new share memory!
at org.apache.commons.httpclient.HttpParser.parseHeaders(HttpParser.java:202)
at org.apache.commons.httpclient.HttpMethodBase.readResponseHeaders(HttpMethodBase.java:1935)
at org.apache.commons.httpclient.HttpMethodBase.readResponse(HttpMethodBase.java:1737)
at org.apache.commons.httpclient.HttpMethodBase.execute(HttpMethodBase.java:1098)
at org.apache.commons.httpclient.HttpMethodDirector.executeWithRetry(HttpMethodDirector.java:398)
at org.apache.commons.httpclient.HttpMethodDirector.executeMethod(HttpMethodDirector.java:171)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:397)
at org.apache.commons.httpclient.HttpClient.executeMethod(HttpClient.java:323)

做网页爬虫的时候,模拟 GET 请求访问网址,出现了上面的异常。问了 3 个前辈,都说没遇到过,这可头疼死了。

一开始也不知道是什么问题,有人说可能是 buffer 太小,把大小设置大一点试试。花了一个晚上查网上资料,终于有点眉目,见参考网址:http://bbs.csdn.net/topics/390178589

/**
 * 
 */
package com.http;

import java.io.IOException;

import org.apache.http.Header;
import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseFactory;
import org.apache.http.HttpVersion;
import org.apache.http.conn.ClientConnectionOperator;
import org.apache.http.conn.OperatedClientConnection;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.conn.BasicClientConnectionManager;
import org.apache.http.impl.conn.DefaultClientConnection;
import org.apache.http.impl.conn.DefaultClientConnectionOperator;
import org.apache.http.impl.conn.DefaultHttpResponseParser;
import org.apache.http.io.HttpMessageParser;
import org.apache.http.io.SessionInputBuffer;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicHttpResponse;
import org.apache.http.message.BasicLineParser;
import org.apache.http.message.BasicStatusLine;
import org.apache.http.message.LineParser;
import org.apache.http.params.HttpParams;
import org.apache.http.util.CharArrayBuffer;
/**
 * @author yingzi
 *
 */
public class MyBasicClientConnectionManager extends BasicClientConnectionManager {

    public MyBasicClientConnectionManager() {
        super();
    }
    
    @Override
    protected ClientConnectionOperator createConnectionOperator( final SchemeRegistry sr) {
        return new MyClientConnectionOperator(sr);
    }
    

    
    class MyClientConnection extends DefaultClientConnection {
        @Override
        protected HttpMessageParser createResponseParser(
                final SessionInputBuffer buffer,
                final HttpResponseFactory responseFactory,
                final HttpParams params) {
            return new MyDefaultHttpResponseParser(buffer, new MyLineParser(),
                    responseFactory, params);
        }
    }
    
    class MyDefaultHttpResponseParser extends DefaultHttpResponseParser {
        public MyDefaultHttpResponseParser(SessionInputBuffer buffer,
                LineParser parser, HttpResponseFactory responseFactory,
                HttpParams params) {
            super(buffer, parser, responseFactory, params);
        }
        @Override
        protected HttpResponse parseHead(
            final SessionInputBuffer sessionBuffer) throws IOException, HttpException {
            try {
                return super.parseHead(sessionBuffer);
            } catch (Exception ex) {
                // 压制ParseException异常
                return new BasicHttpResponse(new BasicStatusLine(HttpVersion.HTTP_1_1, 200, ""));
            }
        }
    }
    
    class MyClientConnectionOperator extends DefaultClientConnectionOperator {
        public MyClientConnectionOperator(final SchemeRegistry sr) {
            super(sr);
        }
    
        @Override
        public OperatedClientConnection createConnection() {
            return new MyClientConnection();
        }
    }
    
    class MyLineParser extends BasicLineParser {
        @Override
        public Header parseHeader(final CharArrayBuffer buffer) {
            try {
                return super.parseHeader(buffer);
            } catch (Exception ex) {
                // 压制ParseException异常
                return new BasicHeader("invalid", buffer.toString());
            }
        }
    }
}
MyBasicClientConnectionManager

 

而我用的是 MultiThreadedHttpConnectionManager(commons-httpclient 3.x),上面的办法不太适用。于是我根据日志里的异常堆栈(stack trace),查看了 HttpClient 具体的 execute 方法里的代码,大致猜想了一下,觉得要把原来的 GetMethod 改掉,换成下面这个类:

/**
 * 
 */
package com.http;

import java.io.IOException;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.HttpState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;



/**
 * GET method that keeps working when the server sends response headers that
 * cannot be parsed.
 *
 * <p>When header parsing fails with an {@link HttpException}, a fixed set of
 * default headers is installed instead so the response can still be
 * consumed. Plain I/O failures are not suppressed.
 *
 * @author yingzi
 */
public class MyHttpGetMethod extends org.apache.commons.httpclient.methods.GetMethod {
    private static final Logger log = LoggerFactory.getLogger( MyHttpGetMethod.class );

    public MyHttpGetMethod(){
        super();
    }

    public MyHttpGetMethod(String url){
        super(url);
    }

    /**
     * Reads the response headers from the connection, substituting default
     * headers when they are malformed.
     *
     * @param state the HTTP state passed in by the caller (not used here)
     * @param conn connection whose response input stream supplies the headers
     * @throws IOException if reading from the connection fails
     * @throws HttpException declared by the overridden method; header parse
     *         failures reported as {@code HttpException} are suppressed here
     */
    @Override
    protected void readResponseHeaders(HttpState state, HttpConnection conn)
         throws IOException, HttpException {
        getResponseHeaderGroup().clear();

        Header[] headers;
        try {
            headers = HttpParser.parseHeaders(
                    conn.getResponseInputStream(), getParams().getHttpElementCharset());
        } catch (HttpException ex) {
            // Suppress the parse failure, but keep the cause (it carries the
            // offending header text) in the log for diagnosis.
            log.warn("response header has some error info , can not parse normally.", ex);
            headers = defaultHeaders();
        }

        // Wire logging moved to HttpParser
        getResponseHeaderGroup().setHeaders(headers);
    }

    /** Builds the headers assumed when the real ones cannot be parsed. */
    private static Header[] defaultHeaders() {
        return new Header[] {
            new Header("Connection","Keep-Alive"),
            new Header("Content-Type","text/html; charset=GB18030"),
            new Header("Keep-Alive","timeout=20"),
            new Header("Cache-control","max-age=3600")
        };
    }

}

 

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。