2007-08-29
htmlParser解析html文件
关键字: htmlparser java 代码
- public class HtmlFileIo {
- public static void main (String[] args)
- {
- String strFile = "file://tenwa-98bf4155e/zhanghftemp/office/JMO_34.htm";
- String strDir = "file://tenwa-98bf4155e/zhanghftemp/office";
- try
- {
- //读取一个文件的内容
- //readByHtml(strFile);
- //得到一个文件最后修改时间
- //getRemoteLastModified(strFile);
- ArrayList al = getRemoteDirInfo(strDir);
- //获取一个文件夹下所有文件的名称
- for(int i=0;i<al.size();i++){
- System.out.println(al.get(i));
- String strName = (al.get(i)).toString();
- //获取楼层数
- System.out.println(strName.substring(0, strName.indexOf('.')).split("_")[1]);
- // System.out.println(strDir+"/"+al.get(i));
- //获取楼层中的单元信息
- // ArrayList alArea = readByHtml(strDir+"/"+al.get(i));
- // for(int j=0;j<alArea.size();j++){
- // HashMap hm = (HashMap)alArea.get(j);
- //获取楼层中的单元信息
- // System.out.println("href:"+hm.get("href")+" shape:"+hm.get("shape")+" coords:"+hm.get("coords"));
- // }
- }
- }
- catch (Exception pe)
- {
- pe.printStackTrace ();
- }
- }
- /**
- * 得到指定文件夹下的所有符合规则的文件(本地)
- * */
- public static ArrayList getFileList(String content)throws Exception{
- URI uri = new URI(content);
- ArrayList al = new ArrayList();
- File file = new File(uri);
- System.out.println(file.exists());
- if(file.isDirectory()){
- File[] filelist = file.listFiles();
- for(int i=0;i<filelist.length;i++){
- if(filelist[i].getName().substring(0, 4).equals("JMO_")&&getFileTypeName(filelist[i].getName())){
- al.add(filelist[i]);
- }
- }
- }
- return al;
- }
- public static boolean getFileTypeName(String strFile){
- if(strFile.substring(strFile.indexOf(".")+1, strFile.length()).equals("htm")||strFile.substring(strFile.indexOf(".")+1, strFile.length()).equals("html")){
- return true;
- }else{
- return false;
- }
- }
- /**
- * 读取房态图中的单元号,房间大小,area位置
- * */
- public static ArrayList readByHtml(String content) throws Exception{
- ArrayList alRoom = new ArrayList();
- Parser parser = new Parser();
- parser.setEncoding("8859_1");
- parser.setInputHTML(getWmlContent(content));
- PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
- factory.registerTag(new AreaTag ());
- parser.setNodeFactory(factory);
- NodeList nlArea = parser.extractAllNodesThatMatch(lnkFilter);
- for(int i=0;i<nlArea.size();i++){
- CompositeTag node = (CompositeTag)nlArea.elementAt(i);
- if(node instanceof AreaTag){
- AreaTag at = (AreaTag)nlArea.elementAt(i);
- HashMap hm = new HashMap();
- hm.put("href", at.getHref());
- hm.put("shape", at.getShape());
- hm.put("coords", at.getCoords());
- alRoom.add(hm);
- }
- }
- return alRoom;
- }
- /**
- * 得到文件中的内容
- * */
- static String getWmlContent(String content) throws Exception{
- StringBuffer wml = new StringBuffer();
- String line = getRemoteInfo(content);
- if(wml.length()>0)
- wml.append("\r\n");
- wml.append(line);
- return wml.toString();
- }
- /**
- * 得到url文件的最后修改时间
- * */
- public static String getRemoteLastModified(String content)throws Exception{
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- URL url = new URL(content);
- java.util.Date dateLast = new java.util.Date(url.openConnection().getLastModified());
- String strMod = sdf.format(dateLast);
- System.out.println(strMod);
- return strMod;
- }
- /**
- * 获取远程html文件的内容
- * */
- public static String getRemoteInfo(String content)throws Exception{
- URL urlfile;
- BufferedReader in;
- String inputLine;
- String info = "";
- try {
- urlfile = new URL(content);
- in = new BufferedReader(new InputStreamReader(urlfile.openStream()));
- inputLine = in.readLine();
- while (inputLine != null) {
- info += inputLine ;
- inputLine = in.readLine();
- }
- in.close();
- } catch (MalformedURLException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e){
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return info;
- }
- /**
- * 获取远程目录中的文件名称
- * */
- public static ArrayList getRemoteDirInfo(String content)throws Exception{
- ArrayList alFile = new ArrayList();
- URL urlfile;
- BufferedReader in;
- String inputLine;
- String info = "";
- try {
- urlfile = new URL(content);
- in = new BufferedReader(new InputStreamReader(urlfile.openStream()));
- inputLine = in.readLine();
- while (inputLine != null) {
- if(inputLine.substring(0,4).equals("JMO_")&&getFileTypeName(inputLine)){
- alFile.add(inputLine);
- }
- inputLine = in.readLine();
- }
- in.close();
- } catch (MalformedURLException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e){
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return alFile;
- }
- static NodeFilter lnkFilter = new NodeFilter() {
- public boolean accept(Node node) {
- if(node instanceof AreaTag)
- return true;
- return false;
- }
- };
- /**
- * 定义area标签,用于查找area信息
- * */
- static class AreaTag extends CompositeTag{
- private static final String[] mIds = new String[] {"area"};
- private static final String[] mEndTagEnders = new String[] {"map"};
- /**
- * Create a new text area tag.
- */
- public AreaTag (){
- }
- /**
- * Return the set of names handled by this tag.
- * @return The names to be matched that create tags of this type.
- */
- public String[] getIds ()
- {
- return (mIds);
- }
- public String[] getEnders (){
- return (mIds);
- }
- public String[] getEndTagEnders (){
- return (mEndTagEnders);
- }
- public String getHref(){
- return super.getAttribute("href");
- }
- public String getCoords(){
- return super.getAttribute("coords");
- }
- public String getShape(){
- return super.getAttribute("shape");
- }
- public String toString(){
- return mIds[0].toString();
- }
- }
- }
评论
jjp2009
2008-08-16
这两句到底读取的是什么啊
jjp2009
2008-08-16
String strFile = "file://tenwa-98bf4155e/zhanghftemp/office/JMO_34.htm";
String strDir = "file://tenwa-98bf4155e/zhanghftemp/office";
楼主这两句话是什么意思啊
String strDir = "file://tenwa-98bf4155e/zhanghftemp/office";
楼主这两句话是什么意思啊
water84222
2008-04-17
请问一下,怎样将修改过的html保存到文件中
code如下
parser = new Parser(getContentByLocalFile(file));
NodeFilter nt = new NodeClassFilter(ImageTag.class) ;
NodeList tmpImageList = (NodeList) parser.parse(nt);
/*linkTmpHash = new Hashtable();
for (int i = 0; i < length; i++) {
Element tmpElement = (Element) tmpNodeList.item(i);
String href = tmpElement.getAttribute("href");
if (href != null && !href.equals("")) {
linkTmpHash.put(href, "");
}
}
data.setHrefs((String[]) linkTmpHash.keySet().toArray(new String[linkTmpHash.size()]));*/
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter (new FileOutputStream (file)));
linkTmpHash = new Hashtable();
for (int i = 0; i < tmpImageList.size(); i++) {
imgnode = (ImageTag)tmpImageList.elementAt(i);
String src = imgnode.getImageURL();
if (URLPathNameUtil.isAbsolutePath(src)) {
if (testAbsolutePath) {
testImagetag(file,src);
}
} else {
if (testRelativePath) {
testImagetag(file, src);
}
}
if(getRealPath()!=null){
imgnode.setImageURL(getRealPath());
writer.write(tmpImageList.toHtml());
}
/*if (src != null && !src.equals("")) {
linkTmpHash.put(src, "");
}*/
}
writer.flush();
writer.close ();
谢谢了
code如下
parser = new Parser(getContentByLocalFile(file));
NodeFilter nt = new NodeClassFilter(ImageTag.class) ;
NodeList tmpImageList = (NodeList) parser.parse(nt);
/*linkTmpHash = new Hashtable();
for (int i = 0; i < length; i++) {
Element tmpElement = (Element) tmpNodeList.item(i);
String href = tmpElement.getAttribute("href");
if (href != null && !href.equals("")) {
linkTmpHash.put(href, "");
}
}
data.setHrefs((String[]) linkTmpHash.keySet().toArray(new String[linkTmpHash.size()]));*/
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter (new FileOutputStream (file)));
linkTmpHash = new Hashtable();
for (int i = 0; i < tmpImageList.size(); i++) {
imgnode = (ImageTag)tmpImageList.elementAt(i);
String src = imgnode.getImageURL();
if (URLPathNameUtil.isAbsolutePath(src)) {
if (testAbsolutePath) {
testImagetag(file,src);
}
} else {
if (testRelativePath) {
testImagetag(file, src);
}
}
if(getRealPath()!=null){
imgnode.setImageURL(getRealPath());
writer.write(tmpImageList.toHtml());
}
/*if (src != null && !src.equals("")) {
linkTmpHash.put(src, "");
}*/
}
writer.flush();
writer.close ();
谢谢了
yongtree
2007-10-17
顶!
好东东哦。
好东东哦。
发表评论
- 浏览: 3605 次
- 性别:

- 来自: 青岛

- 详细资料
搜索本博客
最近加入圈子
最新评论
-
htmlParser解析html文件
这两句到底读取的是什么啊
-- by jjp2009 -
htmlParser解析html文件
String strFile = "file://tenwa-98bf4155e ...
-- by jjp2009 -
htmlParser解析html文件
请问一下,怎样将修改过得html保存到文件中code如下parser = new ...
-- by water84222 -
htmlParser解析html文件
顶! 好东东哦。
-- by yongtree -
SAIF:Struts的AOP
Ivan Li 写道我如果花时间去研究这个,不如直接升级到struts2 谢 ...
-- by cskysnew






评论排行榜