[poi-tl]转换html内容到word

    利用 jsoup 解析 html,再递归遍历各个节点,将不同的 html 标签内容转换成 poi 支持的内容。代码仅供参考。

    完整代码示例请参见:
    https://gitee.com/xuwangcheng/poi-tl-html-to-word

    package yi.master.demo;
    
    import cn.hutool.core.collection.CollUtil;
    
    import cn.hutool.core.io.FileUtil;
    
    import cn.hutool.core.util.ReUtil;
    
    import com.deepoove.poi.NiceXWPFDocument;
    
    import com.deepoove.poi.XWPFTemplate;
    
    import com.deepoove.poi.config.Configure;
    
    import com.deepoove.poi.data.MiniTableRenderData;
    
    import com.deepoove.poi.data.PictureRenderData;
    
    import com.deepoove.poi.data.TextRenderData;
    
    import com.deepoove.poi.policy.AbstractRenderPolicy;
    
    import com.deepoove.poi.policy.PictureRenderPolicy;
    
    import com.deepoove.poi.policy.TextRenderPolicy;
    
    import com.deepoove.poi.render.RenderContext;
    
    import com.deepoove.poi.template.run.RunTemplate;
    
    import com.deepoove.poi.util.TableTools;
    
    import org.apache.commons.lang3.StringUtils;
    
    import org.apache.poi.xwpf.usermodel.*;
    
    import org.jsoup.Jsoup;
    
    import org.jsoup.nodes.Element;
    
    import org.jsoup.nodes.Node;
    
    import org.jsoup.select.Elements;
    
    import org.openxmlformats.schemas.wordprocessingml.x2006.main.STMerge;
    
    import javax.imageio.ImageIO;
    
    import java.awt.image.BufferedImage;
    
    import java.io.File;
    
    import java.io.IOException;
    
    import java.io.InputStream;
    
    import java.util.*;
    
    /**
    
     * @author xuwangcheng
    
     * @version 1.0.0
    
     * @description
    
     * @date 2019/11/21 9:31
    
     */
    
    public class Demo {
    
        /**
         * Demo entry point: reads {@code demo.html}, renders it into the
         * {@code /out_template.docx} classpath template through the custom
         * {@code resultHtml} policy, and writes the result to a fixed output file.
         *
         * @param args unused
         * @throws IOException if the template resource is missing, or if reading or writing fails
         */
        public static void main(String[] args) throws IOException {
            String html = FileUtil.readString("demo.html", "utf-8");

            // Configuration: bind the HTML render policy to the "resultHtml" tag.
            Configure config = Configure.newBuilder().build();
            config.customPolicy("resultHtml", createHtmlRenderPolicy());

            // Data model handed to the template.
            Map<String, Object> map = new HashMap<String, Object>();
            map.put("top", "TOPPPPP");
            map.put("resultHtml", html);
            map.put("buttom", "buttommmmmmmmmmmmmm");

            InputStream templateStream = getResourceInputStream("/out_template.docx");
            if (templateStream == null) {
                // Fail with a clear message instead of an NPE deep inside the template compiler.
                throw new IOException("Template resource not found on classpath: /out_template.docx");
            }
            try {
                XWPFTemplate template = XWPFTemplate.compile(templateStream, config);
                try {
                    template.render(map);
                    template.writeToFile("D:\\demo.docx");
                } finally {
                    // Always release the document, even when rendering or writing fails.
                    template.close();
                }
            } finally {
                templateStream.close();
            }
        }
    
        /**
         * Opens a resource as a stream via {@code FileUtil.class.getResourceAsStream}.
         *
         * @param filePath resource path passed straight to {@code getResourceAsStream}
         * @return the resource stream, or {@code null} when the lookup finds nothing
         */
        public static InputStream getResourceInputStream(String filePath) {
            // The lookup itself already yields null for a missing resource,
            // so its result can be handed back unchanged.
            return FileUtil.class.getResourceAsStream(filePath);
        }
    
        /**
         * Creates the render policy that parses an HTML string and writes its
         * content into the Word document at the position of the template tag.
         *
         * <p>The policy ignores {@code null} or blank data. After rendering, the
         * paragraph that held the template tag is cleared.
         *
         * @return the HTML render policy as an {@link AbstractRenderPolicy}
         */
        private static AbstractRenderPolicy createHtmlRenderPolicy() {
            return new AbstractRenderPolicy() {
                @Override
                protected void afterRender(RenderContext context) {
                    // Clear the paragraph that contained the template tag.
                    clearPlaceholder(context, true);
                }

                @Override
                public void doRender(RunTemplate runTemplate, Object data, XWPFTemplate template) throws Exception {
                    if (data == null || StringUtils.isBlank(data.toString())) {
                        return;
                    }
                    // Enhanced Apache POI document provided by poi-tl.
                    NiceXWPFDocument doc = template.getXWPFDocument();

                    // Unescape the entities the input may carry, drop raw line breaks,
                    // then turn <br> tags into newlines. Replacing the empty string
                    // (as a literal "" pattern would) must be avoided: it inserts a
                    // newline between every character and breaks all markup.
                    String html = data.toString()
                            .replace("&gt;", ">")
                            .replace("&lt;", "<")
                            .replace("&nbsp;", " ")
                            .replace("\n", "")
                            .replaceAll("(?i)<br\\s*/?>", "\n");

                    org.jsoup.nodes.Document htmlDoc = Jsoup.parse(html);
                    XWPFParagraph xwpfParagraph = runTemplate.getRun().getParagraph();
                    // Each top-level element may move the insertion point to a new
                    // paragraph (tables do), so the returned paragraph is carried forward.
                    for (Element e : htmlDoc.body().children()) {
                        xwpfParagraph = parseHtmlToWord(e, doc, xwpfParagraph, true);
                    }
                }
            };
        }
    
        /**
         * Recursively writes one HTML element (and its children) into the Word document.
         *
         * <ul>
         *   <li>{@code img}: rendered as a picture in the current paragraph.</li>
         *   <li>{@code table}: a new paragraph is inserted after the last run of the
         *       current paragraph, the table is rendered there, and that new paragraph
         *       becomes the current one.</li>
         *   <li>{@code sup}: rendered as a bold, 12pt superscript run.</li>
         *   <li>anything else: its own text is written as a run, then its children are
         *       processed; {@code span} is written as whole text without descending.</li>
         * </ul>
         *
         * @param ele           element to convert
         * @param doc           target document
         * @param xwpfParagraph paragraph currently being written to
         * @param isParent      {@code true} when called for a top-level element,
         *                      {@code false} on recursive descent into children
         * @return the paragraph subsequent content should be written to
         * @throws Exception if rendering a picture or table fails
         */
        private static XWPFParagraph parseHtmlToWord(Element ele, NiceXWPFDocument doc, XWPFParagraph xwpfParagraph
                , boolean isParent) throws Exception {
            final String tag = ele.tagName();

            // Pictures go straight into the current paragraph.
            if ("img".equals(tag)) {
                parseImgToWord(ele.attr("src"), xwpfParagraph);
                return xwpfParagraph;
            }

            // Tables get a fresh paragraph, which then becomes the current one.
            if ("table".equals(tag)) {
                XWPFRun lastRun = CollUtil.getLast(xwpfParagraph.getRuns());
                XWPFParagraph tableParagraph = doc.insertNewParagraph(lastRun);
                parseTableToWord(doc, ele, tableParagraph.createRun());
                return tableParagraph;
            }

            // Superscript: fixed bold 12pt styling with separate western / east-asian fonts.
            if ("sup".equalsIgnoreCase(tag)) {
                String supText = ele.text();
                XWPFRun supRun = xwpfParagraph.createRun();
                supRun.setText(supText);
                supRun.setBold(true);
                supRun.setFontSize(12);
                supRun.setFontFamily("Times New Roman", XWPFRun.FontCharRange.ascii);
                supRun.setFontFamily("宋体", XWPFRun.FontCharRange.eastAsia);
                supRun.setSubscript(VerticalAlign.SUPERSCRIPT);
                TextRenderPolicy.Helper.renderTextRun(supRun, new TextRenderData(supText));
                return xwpfParagraph;
            }

            // Generic text-bearing element. A span is treated as pure text: its whole
            // text is written at once and its children are not visited.
            final boolean isSpan = "span".equalsIgnoreCase(tag);
            final String text = isSpan ? ele.wholeText() : ele.ownText();

            // Block-like tags start on a new line when they are top-level or carry text.
            boolean breakingTag = ReUtil.isMatch("(p|h[12345]|li|img)", tag);
            if (breakingTag && (isParent || StringUtils.isNotBlank(ele.text()))) {
                xwpfParagraph.createRun().addBreak();
            }

            if (StringUtils.isNotBlank(text)) {
                TextRenderPolicy.Helper.renderTextRun(xwpfParagraph.createRun(), new TextRenderData(text));
            }

            if (!isSpan) {
                for (Element child : ele.children()) {
                    xwpfParagraph = parseHtmlToWord(child, doc, xwpfParagraph, false);
                }
            }
            return xwpfParagraph;
        }
    
        /**
         * Renders an image into the given paragraph.
         *
         * <p>The image URL is mapped to a local file through {@code getImgRealPath}.
         * Nothing is rendered when that path is blank, the file does not exist, or
         * the file cannot be decoded as an image. Images wider than 600 pixels are
         * scaled down by an integer factor.
         *
         * @param imgUrl        value of the image's {@code src} attribute
         * @param xwpfParagraph paragraph that receives the picture run
         * @throws Exception if reading the file or rendering the picture fails
         */
        private static void parseImgToWord(String imgUrl, XWPFParagraph xwpfParagraph) throws Exception {
            // Resolve the local path of the image.
            String imgRealPath = getImgRealPath(imgUrl);
            if (StringUtils.isBlank(imgRealPath) || !FileUtil.exist(imgRealPath)) {
                return;
            }

            BufferedImage img = ImageIO.read(new File(imgRealPath));
            if (img == null) {
                // ImageIO.read yields null when no registered reader understands the
                // file; skip the picture instead of failing with a NullPointerException.
                return;
            }

            int width = img.getWidth();
            int height = img.getHeight();
            if (width > 600) {
                // Integer shrink factor that brings the width under 600; the extra
                // 20 pixels taken off the width are kept from the original sizing.
                int rate = (width / 600) + 1;
                width = width / rate - 20;
                height = height / rate;
            }

            PictureRenderData pictureRenderData = new PictureRenderData(width, height, imgRealPath);
            XWPFRun run = xwpfParagraph.createRun();
            PictureRenderPolicy.Helper.renderPicture(run, pictureRenderData);
        }
    
        /**
         * Maps an image URL to the path of the image on the local file system.
         *
         * <p>Placeholder implementation: the URL is ignored and a fixed path is returned.
         *
         * @param imgUrl image URL taken from the HTML
         * @return local file path of the image
         */
        private static String getImgRealPath (String imgUrl) {
            // TODO resolve the real local path from imgUrl
            final String placeholderPath = "G:\\1508490175_417949.jpg";
            return placeholderPath;
        }
    
        /**
         * Converts an HTML table into a Word table inserted at the given run.
         *
         * <p>The table HTML is first passed through {@code simplifyTable}, which
         * pads spanned cells with copies so that rows line up into a grid. Each
         * {@code td} is then written to the matching Word cell. Cells covered by a
         * neighbour's {@code colspan} / {@code rowspan} are marked as merge
         * continuations and receive no content; every other cell starts a merge
         * region and has its content rendered via {@code parseHtmlToWord}.
         *
         * <p>Nothing is inserted when the table has no {@code tr} or no {@code td}.
         *
         * @param doc target document
         * @param ele the HTML {@code table} element
         * @param run run at which the Word table is inserted
         * @throws Exception if a span attribute is not a number or rendering cell content fails
         */
        private static void parseTableToWord(NiceXWPFDocument doc, Element ele, XWPFRun run) throws Exception {
            // Simplify the table HTML so that spans are expanded into real cells.
            org.jsoup.nodes.Document tableDoc = Jsoup.parse(simplifyTable(ele.outerHtml()));
            Elements trList = tableDoc.getElementsByTag("tr");
            if (trList.isEmpty()) {
                return;
            }

            // Size the grid by the widest row so that a row with more cells than
            // the first one cannot run past the table.
            int rowCount = trList.size();
            int colCount = 0;
            for (Element tr : trList) {
                colCount = Math.max(colCount, tr.getElementsByTag("td").size());
            }
            if (colCount == 0) {
                return;
            }

            // Create and style the table.
            XWPFTable xwpfTable = doc.insertNewTable(run, rowCount, colCount);
            TableTools.widthTable(xwpfTable, MiniTableRenderData.WIDTH_A4_FULL, colCount);
            TableTools.borderTable(xwpfTable, 4);

            // Grid positions covered by a colspan / rowspan declared on an earlier cell.
            boolean[][] coveredByColspan = new boolean[rowCount][colCount];
            boolean[][] coveredByRowspan = new boolean[rowCount][colCount];

            for (int row = 0; row < rowCount; row++) {
                Elements tds = trList.get(row).getElementsByTag("td");
                for (int col = 0; col < tds.size(); col++) {
                    Element colElement = tds.get(col);
                    String colspan = colElement.attr("colspan");
                    String rowspan = colElement.attr("rowspan");
                    String style = colElement.attr("style");

                    // Flag the positions this cell spans over; spans reaching past
                    // the grid are clipped rather than overflowing the arrays.
                    if (!StringUtils.isEmpty(colspan)) {
                        int span = Integer.parseInt(colspan);
                        for (int i = 1; i < span && col + i < colCount; i++) {
                            coveredByColspan[row][col + i] = true;
                        }
                    }
                    if (!StringUtils.isEmpty(rowspan)) {
                        int span = Integer.parseInt(rowspan);
                        for (int i = 1; i < span && row + i < rowCount; i++) {
                            coveredByRowspan[row + i][col] = true;
                        }
                    }

                    XWPFTableCell tableCell = xwpfTable.getRow(row).getCell(col);

                    // Horizontal merge: a covered cell without its own colspan continues
                    // the region and gets no content; any other cell starts a region.
                    if (StringUtils.isEmpty(colspan) && coveredByColspan[row][col]) {
                        setHorizontalMerge(tableCell, STMerge.CONTINUE);
                        continue;
                    }
                    setHorizontalMerge(tableCell, STMerge.RESTART);

                    // Vertical merge, same rule.
                    if (StringUtils.isEmpty(rowspan) && coveredByRowspan[row][col]) {
                        setVerticalMerge(tableCell, STMerge.CONTINUE);
                        continue;
                    }
                    setVerticalMerge(tableCell, STMerge.RESTART);

                    // Replace the cell's initial paragraph with a fresh one for the content.
                    tableCell.removeParagraph(0);
                    XWPFParagraph paragraph = tableCell.addParagraph();
                    if (!StringUtils.isEmpty(style)
                            && style.replaceAll("\\s", "").toLowerCase(Locale.ROOT).contains("text-align:center")) {
                        paragraph.setAlignment(ParagraphAlignment.CENTER);
                    }
                    parseHtmlToWord(colElement, doc, paragraph, true);
                }
            }
        }

        /** Sets the cell's horizontal-merge value, creating the property nodes when absent. */
        private static void setHorizontalMerge(XWPFTableCell cell, STMerge.Enum value) {
            if (cell.getCTTc().getTcPr() == null) {
                cell.getCTTc().addNewTcPr();
            }
            if (cell.getCTTc().getTcPr().getHMerge() == null) {
                cell.getCTTc().getTcPr().addNewHMerge();
            }
            cell.getCTTc().getTcPr().getHMerge().setVal(value);
        }

        /** Sets the cell's vertical-merge value, creating the property nodes when absent. */
        private static void setVerticalMerge(XWPFTableCell cell, STMerge.Enum value) {
            if (cell.getCTTc().getTcPr() == null) {
                cell.getCTTc().addNewTcPr();
            }
            if (cell.getCTTc().getTcPr().getVMerge() == null) {
                cell.getCTTc().getTcPr().addNewVMerge();
            }
            cell.getCTTc().getTcPr().getVMerge().setVal(value);
        }
    
        /**
         * Simplifies the DOM of an HTML table so that spanned cells become real cells.
         *
         * <p>Two passes are made over the rows:
         * <ol>
         *   <li>colspan: the {@code class} attribute is removed from every row and
         *       cell, and each cell with {@code colspan=n} is followed by {@code n-1}
         *       copies of itself that no longer carry the {@code colspan} attribute;</li>
         *   <li>rowspan: for each column of the first row, every cell with
         *       {@code rowspan=n} is copied (without {@code rowspan}) into the next
         *       {@code n-1} sibling rows.</li>
         * </ol>
         * The original cells keep their span attributes.
         *
         * @param tableContent HTML of the table
         * @return outer HTML of the simplified table; {@code null} when
         *         {@code tableContent} is empty; the input unchanged when it
         *         contains no {@code table} element
         */
        private static String simplifyTable(String tableContent) {
            if (StringUtils.isEmpty(tableContent)) {
                return null;
            }
            org.jsoup.nodes.Document tableDoc = Jsoup.parse(tableContent);
            Element tableEle = tableDoc.getElementsByTag("table").first();
            if (tableEle == null) {
                // No table to simplify.
                return tableContent;
            }
            Elements trElements = tableDoc.getElementsByTag("tr");
            if (trElements.isEmpty()) {
                return tableEle.outerHtml();
            }

            // Pass 1: colspan.
            for (Element trElement : trElements) {
                // Strip styling classes.
                trElement.removeAttr("class");
                for (Element curTdElement : trElement.getElementsByTag("td")) {
                    curTdElement.removeAttr("class");
                    String colspanValStr = curTdElement.attr("colspan");
                    if (StringUtils.isEmpty(colspanValStr)) {
                        continue;
                    }
                    Element filler = curTdElement.clone();
                    filler.removeAttr("colspan");
                    String fillerHtml = filler.outerHtml();
                    int colspanVal = Integer.parseInt(colspanValStr);
                    for (int k = 0; k < colspanVal - 1; k++) {
                        curTdElement.after(fillerHtml);
                    }
                }
            }

            // Pass 2: rowspan, column by column over the width of the first row
            // (measured after the colspan pass).
            int tdCount = trElements.first().getElementsByTag("td").size();
            for (int i = 0; i < tdCount; i++) {
                // NOTE(review): the insert position is a child-node index (i + 1 for
                // every column but the first), kept exactly as before; confirm it
                // lands on the intended cell when rows contain whitespace text nodes.
                int indexNum = (i == 0) ? 0 : i + 1;
                for (Element trElement : trElements) {
                    // Cells are re-queried per row because earlier inserts change the rows.
                    Elements rowCells = trElement.getElementsByTag("td");
                    if (i >= rowCells.size()) {
                        continue;
                    }
                    Element curTd = rowCells.get(i);
                    String rowspanValStr = curTd.attr("rowspan");
                    if (StringUtils.isEmpty(rowspanValStr)) {
                        continue;
                    }
                    Node fillerNode = curTd.clone();
                    fillerNode.removeAttr("rowspan");
                    int rowspanVal = Integer.parseInt(rowspanValStr);

                    Element targetRow = trElement;
                    for (int j = 0; j < rowspanVal - 1; j++) {
                        targetRow = targetRow.nextElementSibling();
                        if (targetRow == null) {
                            // rowspan reaches past the last sibling row; nothing left to pad.
                            break;
                        }
                        Node tempNode = fillerNode.clone();
                        if (indexNum == tdCount || indexNum > targetRow.childNodeSize()) {
                            // Append when the position is at the end of the row or
                            // beyond what the target row currently holds.
                            targetRow.appendChild(tempNode);
                        } else {
                            targetRow.insertChildren(indexNum, Collections.singletonList(tempNode));
                        }
                    }
                }
            }

            return tableEle.outerHtml();
        }
    
        /**
         * Copies the elements of an {@link Elements} collection into a new list,
         * preserving their order.
         *
         * @param curElements elements to copy
         * @return a new {@link List} holding the same elements
         */
        private static List<Element> covertElements2List(Elements curElements){
            // Elements is itself a collection of Element, so the copy constructor
            // performs the same element-by-element copy in iteration order.
            return new ArrayList<Element>(curElements);
        }
    
    }
    
    以前,我总以为自己是菜鸟,也总想着早起的鸟儿有虫吃。直到有一天我想飞,才愕然的发现自己没有翅膀和羽毛,我竟然是菜虫!早起的虫儿被鸟吃……原来,百足应该厚积薄发!
    validate