多篇“网页格式文章”根据第1篇的目录进行所有文章的同目录级别滚动

目录

      • 一、写作背景
      • 二、实现思路
      • 三、Maven依赖
      • 四、代码
      • 五、结果

一、写作背景

公司和某大学合作了一个编辑器项目,其中涉及两篇文章的比对功能,领导制定的逻辑是“点击左侧文章列表,右侧页面会展示多篇文章的详情页面,点击第1篇文章的目录,其他几篇文章也要滚动到同一级别目录上”

二、实现思路

  • 将文章(docx格式)使用onlyoffice的文章转换功能转成html网页
  • 根据正则表达式(h1~h6标签)从html网页中提取目录信息
  • 按照“X-X-X”格式为每个目录生成编号,并将该编号作为对应标题标签的id属性值
  • 生成目录结构树,每个目录对象中包含目录名称、id属性值、目录级别,可以用作前端展示目录,以及实现“写作背景”中提到的多篇目录联动
  • 修改原有html网页文件,往h1~h6标签上添加id属性,可以给前端同事展示用

三、Maven依赖

<dependency>
	<groupId>cn.wanghaomiao</groupId>
	<artifactId>JsoupXpath</artifactId>
	<version>2.3.2</version>
</dependency>

<dependency>
	<groupId>org.projectlombok</groupId>
	<artifactId>lombok</artifactId>
	<version>1.18.36</version>
</dependency>

四、代码

import lombok.Data;
import lombok.ToString;
import org.jsoup.Jsoup;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Test {
    public static void main(String[] args) {
        // 0、准备数据
        String html = "

1

1.1

1.1.1

1.2

1.2.1

1.2.2

2

2.1

2.2

2.2.1

2.2.1.1.1.1
"
; // 1、提取目录信息 List<Integer> levelList = new ArrayList<>(); List<String> nameList = new ArrayList<>(); List<String> headingHtmlList = new ArrayList<>(); extractHeading(html, levelList, nameList, headingHtmlList); // 打印目录信息 System.out.println("start 打印目录标题和级别"); for (int i = 0; i < nameList.size(); i++) { System.out.print("标题:" + nameList.get(i) + "、级别:" + levelList.get(i) + ";"); } System.out.println("\nend 打印目录标题和级别\n"); // 2、获取目录级别树 Heading heading = headingTree(levelList, nameList); // 3、获取目录对应id(对应h1~6标签的属性)属性值集合 List<String> ids = getHeadingIdList(heading); // 打印目录对应id属性值集合 System.out.println("start 打印目录对应id属性值集合"); System.out.println("目录对应id属性值集合:" + ids); System.out.println("end 打印目录对应id属性值集合\n"); // 4、往目录标签后面添加id属性值 String newHtml = addId2Html(html, headingHtmlList, ids); // 打印新html System.out.println("start 打印添加id属性后的html值"); System.out.println("添加id属性后的html值:" + newHtml); System.out.println("end 打印添加id属性后的html值\n"); // 打印目录级别树(在getHeadingIdList方法中填充了id属性,所以在此处打印目录级别树) System.out.println("start 打印目录级别树"); System.out.println("目录级别树:" + heading); System.out.println("end 打印目录级别树"); } /** * 将id属性值添加到html中 * * @author guoming * @date 2025/1/22 14:37 * @param oldHtml 原始网页文本 * @param headingHtmlList 提取出来的目录网页文本 * @param ids 目录id属性值集合 * @return 新网页文本 **/ private static String addId2Html(String oldHtml, List<String> headingHtmlList, List<String> ids) { for (int i = 0; i < headingHtmlList.size(); i++) { String headingHtml = headingHtmlList.get(i); int index = oldHtml.indexOf(headingHtml); StringBuilder sb = new StringBuilder(oldHtml); sb.insert(index + 4, "id=\\\"" + ids.get(i) + "\\\""); oldHtml = sb.toString(); } return oldHtml; } /** * 获取目录对应id(对应h1~6标签的属性)属性值集合 * * @author 明快de玄米61 * @date 2025/1/22 14:32 * @param heading 顶级目录对象 * @return 目录对应id(对应h1~6标签的属性)属性值集合 **/ private static List<String> getHeadingIdList(Heading heading) { List<String> ids = new ArrayList<>(); for (int i = 0; i < heading.getChildren().size(); i++) { Heading child = 
heading.getChildren().get(i); String nextPrefix = getNextLevelHeadingIdPrefix(child.getLevel(), 0, "", i + 1); String id = nextPrefix.substring(0, nextPrefix.length() - 1); child.setId(id); ids.add(id); generateHeadingId(child, nextPrefix, ids); } return ids; } /** * 获取目录信息 * * @author 明快de玄米61 * @date 2025/1/22 14:33 * @param html 最初网页 * @param levelList 目录级别集合 * @param nameList 目录名称集合 * @param headingHtmlList 单个目录网页代码集合 * @return **/ private static void extractHeading(String html, List<Integer> levelList, List<String> nameList, List<String> headingHtmlList) { Pattern pattern = Pattern.compile(""); Matcher matcher = pattern.matcher(html); while (matcher.find()) { levelList.add(Integer.valueOf(matcher.group(1))); // 获取目录名称集合 nameList.add(Jsoup.parse(matcher.group()).text().trim()); headingHtmlList.add(matcher.group()); } } /** * 组装目录级别树 * * @author 明快de玄米61 * @date 2025/1/22 14:12 * @param levelList 目录级别集合 * @param nameList 目录名称集合 * @return 目录级别树-顶节点 **/ private static Heading headingTree(List<Integer> levelList, List<String> nameList) { List<Heading> processed = new ArrayList<>(); Heading heading = new Heading(); heading.setChildren(new ArrayList<Heading>()); for (int i = 0; i < levelList.size(); i++) { Integer level = levelList.get(i); // 当前对象 Heading entity = new Heading(); entity.setLevel(level); entity.setName(nameList.get(i)); entity.setChildren(new ArrayList<Heading>()); // 获取父级对象 Heading parent = getParentHeading(processed, level); if (parent == null) { heading.getChildren().add(entity); } else { parent.getChildren().add(entity); } // 放入集合 processed.add(entity); } return heading; } /** * 获取下一个级别的id前缀 * * @author 明快de玄米61 * @date 2025/1/22 14:29 * @param currentLevel 当前目录级别 * @param parentLevel 父级目录级别 * @return 下一个级别的id前缀 **/ private static String getNextLevelHeadingIdPrefix(Integer currentLevel, Integer parentLevel, String prefix, Integer sort) { StringBuilder sb = new StringBuilder(); if (prefix.length() == 0) { sb.append(sort).append("-"); for (Integer i 
= 2; i <= currentLevel; i++) { sb.append(1).append("-"); } } else { if (currentLevel - 1 > parentLevel) { sb.append(prefix); for (Integer i = parentLevel + 2; i <= currentLevel; i++) { sb.append(1).append("-"); } sb.append(sort).append("-"); } else { sb.append(prefix).append(sort).append("-"); } } return sb.toString(); } /** * 生成目录对应的id,最后会添加到html中当做id属性值 * * @param entity 目录对象 * @param prefix 目录id前缀 **/ private static void generateHeadingId(Heading entity, String prefix, List<String> ids) { List<Heading> childs = entity.getChildren(); if (childs == null || childs.size() == 0) { return; } for (int i = 0; i < childs.size(); i++) { Heading child = childs.get(i); String newPrefix = getNextLevelHeadingIdPrefix(child.getLevel(), entity.getLevel(), prefix, i + 1); String id = newPrefix.substring(0, newPrefix.length() - 1); child.setId(id); ids.add(id); generateHeadingId(child, newPrefix, ids); } } /** * 获取父类目录(采用反向思维查找) * * @param headings 目录对象集合 * @param level 目录级别 * @return 父级目录对象 **/ private static Heading getParentHeading(List<Heading> headings, Integer level) { if (headings.size() == 0) { return null; } for (int i = headings.size() - 1; i >= 0; i--) { Heading entity = headings.get(i); if (entity.getLevel() < level) { return entity; } } return null; } } /** * 目录类 */ @Data @ToString class Heading { // 对应id属性值 private String id; // 名称 private String name; // 级别 private Integer level; // 子级目录集合 private List<Heading> children; }

五、结果

start 打印目录标题和级别
标题:1、级别:1;标题:1.1、级别:2;标题:1.1.1、级别:3;标题:1.2、级别:2;标题:1.2.1、级别:3;标题:1.2.2、级别:3;标题:2、级别:1;标题:2.1、级别:2;标题:2.2、级别:2;标题:2.2.1、级别:3;标题:2.2.1.1.1.1、级别:6;
end 打印目录标题和级别

start 打印目录对应id属性值集合
目录对应id属性值集合:[1, 1-1, 1-1-1, 1-2, 1-2-1, 1-2-2, 2, 2-1, 2-2, 2-2-1, 2-2-1-1-1-1]
end 打印目录对应id属性值集合

start 打印添加id属性后的html值
添加id属性后的html值:<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head><body><h1 id=\"1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:18pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:20pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1</span></h1><h2 id=\"1-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.1</span></h2><h3 id=\"1-1-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.1.1</span></h3><h2 id=\"1-2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.2</span></h2><h3 id=\"1-2-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span 
style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.2.1</span></h3><h3 id=\"1-2-2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">1.2.2</span></h3><h1 id=\"2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:18pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:20pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2</span></h1><h2 id=\"2-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2.1</span></h2><h2 id=\"2-2\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:16pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2.2</span></h2><h3 id=\"2-2-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:8pt;margin-bottom:4pt;border:none;mso-border-left-alt:none;mso-border-top-alt:none;mso-border-right-alt:none;mso-border-bottom-alt:none;mso-border-between:none\"><span 
style=\"font-family:'Arial';font-size:14pt;color:#2e75b5;mso-style-textfill-fill-color:#2e75b5\">2.2.1</span></h3><h6 id=\"2-2-1-1-1-1\"style=\"mso-pagination:widow-orphan lines-together;page-break-after:avoid;margin-top:2pt;margin-bottom:0pt;border:none;border-left:none;border-top:none;border-right:none;border-bottom:none;mso-border-between:none\"><span style=\"font-family:'Arial';font-size:11pt;color:#595959;mso-style-textfill-fill-color:#595959\"><i>2.2.1.1.1.1</i></span></h6></body></html>
end 打印添加id属性后的html值

start 打印目录级别树
目录级别树:Heading(id=null, name=null, level=null, children=[Heading(id=1, name=1, level=1, children=[Heading(id=1-1, name=1.1, level=2, children=[Heading(id=1-1-1, name=1.1.1, level=3, children=[])]), Heading(id=1-2, name=1.2, level=2, children=[Heading(id=1-2-1, name=1.2.1, level=3, children=[]), Heading(id=1-2-2, name=1.2.2, level=3, children=[])])]), Heading(id=2, name=2, level=1, children=[Heading(id=2-1, name=2.1, level=2, children=[]), Heading(id=2-2, name=2.2, level=2, children=[Heading(id=2-2-1, name=2.2.1, level=3, children=[Heading(id=2-2-1-1-1-1, name=2.2.1.1.1.1, level=6, children=[])])])])])
end 打印目录级别树

你可能感兴趣的:(java学习之路,java)