使用NekoHtml处理网页(删除Style标签)

最近在做一个手机APP,通过一个新闻抓取程序抓取新闻,然后通过APP展示新闻。后来发现手机端不支持Style标签,如果网页中有Style标签,标签内的内容会被直接显示出来,非常影响页面美观。于是就写了一个用NekoHTML来清除Style标签的工具类。


html.filter.properties 配置文件,配置允许的标签和要删除的标签及标签内的属性

# Attribute names passed to ElementRemover.acceptElement for every tag listed in acceptTags
attributes=style,id,name,class,width,height,src,oldsrc,complete,align,alt,title
# Tags registered with ElementRemover.acceptElement (comma-separated)
acceptTags=div,span,a,li,ul,nav,br,p,img,font,b,strong,table,tr,td
# Tags registered with ElementRemover.removeElement (comma-separated)
removeTags=style

PropertiesUtils 读取Properties

package com.tiamaes.gjds.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.springframework.core.io.ClassPathResource;

/**  
 * <p>类描述: 读取Properties中的属性 </p>
 * <p>创建人:王成委  </p>
 * <p>创建时间:2015年1月28日 上午11:23:27  </p>
 * <p>版权说明: © 2015 Tiamaes </p>
 */
public class PropertiesUtils {
	private Properties properties;
	
	public PropertiesUtils(String path){
		try {
			ClassPathResource resource = new ClassPathResource(path);
			properties = new Properties();
			properties.load(resource.getInputStream());
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	public String get(String key){
		return this.properties.getProperty(key);
	}

}

过滤HTML中的标签

package com.tiamaes.gjds.util;

import java.io.CharArrayReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.filters.Writer;
import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;

/**
 * Filters HTML markup with NekoHTML's {@code ElementRemover}: tags listed as
 * accepted are registered via {@code acceptElement} together with the configured
 * attribute names, and tags listed for removal are registered via
 * {@code removeElement}.
 *
 * <p>The initial tag and attribute lists are read from the classpath resource
 * {@code html.filter.properties} (keys {@code attributes}, {@code acceptTags},
 * {@code removeTags}, each a comma-separated list).</p>
 *
 * <p>NOTE(review): the add/remove methods mutate plain {@code ArrayList}s without
 * synchronization, so customising the shared instance returned by
 * {@link #getInstance()} while other threads call {@link #doFilter(String)} is not
 * safe; use {@link #getInstance(boolean) getInstance(true)} for a private copy.</p>
 *
 * <p>Author: Wang Chengwei (王成委)</p>
 * <p>Created: 2015-01-29 10:45:02</p>
 * <p>Copyright: © 2015 Tiamaes</p>
 */
public class HtmlFilterUtils {
	private static final String CONFIG_PATH = "html.filter.properties";
	private static final String ATTRIBUTE_FIELD = "attributes";
	private static final String ACCEPT_TAGS_FIELD = "acceptTags";
	private static final String REMOVE_TAGS_FIELD = "removeTags";
	private static final String FILTERS_PROPERTY = "http://cyberneko.org/html/properties/filters";
	
	/** Configuration shared by all instances; loaded lazily under the class lock. */
	private static PropertiesUtils properties = null;
	/** Shared instance; volatile so the unsynchronized null check in getInstance is safe. */
	private static volatile HtmlFilterUtils filter = null;
	
	private final List<String> attributes = new ArrayList<String>();
	private final List<String> acceptTags = new ArrayList<String>();
	private final List<String> removeTags = new ArrayList<String>();
	
	/** Creates the shared instance if no other thread has done so yet. */
	private static synchronized void syncInit(){
		if(filter == null){
			filter = new HtmlFilterUtils();
		}
	}
	
	/** Loads the configuration file once and returns the cached result afterwards. */
	private static synchronized PropertiesUtils loadConfig(){
		if(properties == null){
			properties = new PropertiesUtils(CONFIG_PATH);
		}
		return properties;
	}
	
	/**
	 * Returns the shared filter instance, creating it on first use.
	 *
	 * @return the shared instance
	 */
	public static HtmlFilterUtils getInstance(){
		return getInstance(false);
	}
	
	/**
	 * Returns either a fresh filter or the shared one.
	 *
	 * @param createNew {@code true} to get a new, independently configurable instance;
	 *                  {@code false} to get the shared instance
	 * @return the requested instance
	 */
	public static HtmlFilterUtils getInstance(boolean createNew){
		if(createNew){
			return new HtmlFilterUtils();
		}
		if(filter == null){
			syncInit();
		}
		return filter;
	}
	
	/** Populates the three lists from the configuration file. */
	private HtmlFilterUtils(){
		PropertiesUtils config = loadConfig();
		this.addToList(attributes, config.get(ATTRIBUTE_FIELD));
		this.addToList(acceptTags, config.get(ACCEPT_TAGS_FIELD));
		this.addToList(removeTags, config.get(REMOVE_TAGS_FIELD));
	}
	
	/**
	 * Adds an attribute name to the list passed to {@code acceptElement}.
	 * (The misspelled method name is kept for compatibility with existing callers.)
	 *
	 * @param attrName attribute name to add
	 */
	public void addAtributes(String attrName){
		this.attributes.add(attrName);
	}
	
	/**
	 * Removes the first occurrence of an attribute name from the attribute list.
	 *
	 * @param attrName attribute name to remove
	 */
	public void removeAtributes(String attrName){
		this.attributes.remove(attrName);
	}
	
	/**
	 * Adds a tag to the list registered with {@code removeElement}.
	 * (The misspelled method name is kept for compatibility with existing callers.)
	 *
	 * @param tagName tag name to add
	 */
	public void addRmoveTag(String tagName){
		this.removeTags.add(tagName);
	}
	
	/**
	 * Removes the first occurrence of a tag from the remove list.
	 *
	 * @param tagName tag name to remove
	 */
	public void removeRmoveTag(String tagName){
		this.removeTags.remove(tagName);
	}
	
	/**
	 * Adds a tag to the list registered with {@code acceptElement}.
	 *
	 * @param tagName tag name to add
	 */
	public void addAcceptTag(String tagName){
		this.acceptTags.add(tagName);
	}
	
	/**
	 * Removes the first occurrence of a tag from the accept list.
	 *
	 * @param tagName tag name to remove
	 */
	public void removeAcceptTag(String tagName){
		this.acceptTags.remove(tagName);
	}
	
	/**
	 * Splits a comma-separated value and appends each non-empty, trimmed token.
	 *
	 * @param list    destination list
	 * @param sources comma-separated tokens; {@code null} (missing key) adds nothing
	 */
	private void addToList(List<String> list,String sources){
		if(sources == null){
			return;
		}
		for(String token : sources.split(",")){
			String name = token.trim();
			// Skip blanks produced by stray commas or whitespace-only entries.
			if(name.length() > 0){
				list.add(name);
			}
		}
	}
	
	/**
	 * Runs the given HTML through the configured {@code ElementRemover} and returns
	 * the serialized result.
	 *
	 * <p>If parsing or filtering throws, the stack trace is printed and the input is
	 * returned unchanged.</p>
	 *
	 * @param htmlCode HTML source to filter; {@code null} is returned as {@code null}
	 * @return the filtered HTML, or {@code htmlCode} itself when filtering fails
	 */
	public String doFilter(String htmlCode){
		if(htmlCode == null){
			return null;
		}
		
		ElementRemover remover = new ElementRemover();
		// Same attribute array for every accepted tag, so build it once.
		String[] attrs = attributes.toArray(new String[attributes.size()]);
		for(String tag : acceptTags){
			remover.acceptElement(tag, attrs);
		}
		for(String tag : removeTags){
			remover.removeElement(tag);
		}
		
		CharArrayReader reader = new CharArrayReader(htmlCode.toCharArray());
		try {
			StringWriter filtered = new StringWriter();
			Writer writer = new Writer(filtered,"UTF-8");
			// Filter order matters: remove elements first, then serialize what is left.
			XMLDocumentFilter[] filters = {remover,writer};
			
			DOMParser parser = new DOMParser();
			parser.setProperty(FILTERS_PROPERTY, filters);
			parser.parse(new InputSource(reader));
			return filtered.toString();
		} catch (Exception e) {
			// Best-effort: on any failure hand back the unfiltered input.
			e.printStackTrace();
			return htmlCode;
		} finally {
			reader.close();
		}
	}
}
调用 HtmlFilterUtils.getInstance().doFilter(html) 即可过滤HTML的内容,返回值为过滤后的HTML字符串。


郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。