SpringBoot2.3.x整合ElasticSearch7.6.2 实现PDF,WORD全文检索

     阅读:22

1、下载安装,只下载elasticSearch、Kibana即可

这里我使用 7.6.2 版本的 Elasticsearch。因为项目使用的是 Spring Boot 2.3.x,为了避免出现“低版本客户端连接高版本索引库”的兼容性问题,这里先退回使用低版本的索引库。
在这里插入图片描述
在这里插入图片描述

插件安装(管道中的 attachment 处理器需要 ingest-attachment 插件,ik_max_word 分词器需要 IK 分词插件)

在这里插入图片描述

插件下载完成之后,将压缩包解压到 Elasticsearch 的 plugins 目录(插件版本需与 Elasticsearch 版本一致,这里为 7.6.2),之后重启 Elasticsearch。
在这里插入图片描述
在这里插入图片描述

定义文本抽取管道

PUT /_ingest/pipeline/attachment
{
  "description": "Extract attachment information",
  "processors": [
    {
      "attachment": {
        "field": "data",
        "indexed_chars": -1,
        "ignore_missing": true
      }
    },
    {
      "remove": {
        "field": "data",
        "ignore_missing": true
      }
    }
  ]
}

2、SpringBoot整合ElasticSearch

<dependencies>
    <!-- REST controller support (embedded servlet container + Spring MVC). -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- Test-only: keep the testing libraries out of the runtime classpath. -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <!-- Provides ElasticsearchRestTemplate and RestHighLevelClient. -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
    </dependency>
    <!-- Used to serialise the document to JSON; 1.2.83 contains the fixes for the
         deserialization vulnerabilities reported against earlier 1.2.x releases. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.83</version>
    </dependency>
    <!-- Generates getters/setters for the entity class (@Data). -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.20</version>
    </dependency>
</dependencies>

application.yml

# HTTP port of this service; the test URLs in this article use localhost:9090.
server:
  port: 9090
spring:
  application:
    name: elasticsearch-service
  elasticsearch:
    rest:
      # Address of the Elasticsearch node the REST client connects to.
      uris: http://127.0.0.1:9200

实体类

package top.fate.entity;

import lombok.Data;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import org.springframework.data.elasticsearch.annotations.FieldType;

/**
 * Elasticsearch document describing one indexed file.
 *
 * <p>Getters and setters are generated by Lombok's {@code @Data}. All fields are
 * mapped as exact-match {@code keyword} values except {@link #page}, which is
 * numeric, and {@link #data}, which is analysed text.
 *
 * <p>NOTE(review): the index name declared here ({@code filedata}) is only used by
 * callers that resolve the index from this class; confirm it matches the index
 * the rest of the application reads and writes.
 *
 * @author Wangxl (18335844494@163.com)
 * @since 2020/11/2
 */
@Data
@Document(indexName = "filedata")
public class FileData {

    /** Primary key of the file. */
    @Field(type = FieldType.Keyword)
    private String filePk;

    /** File name. */
    @Field(type = FieldType.Keyword)
    private String fileName;

    /** Page number; mapped as an integer so range queries and numeric sorting work. */
    @Field(type = FieldType.Integer)
    private Integer page;

    @Field(type = FieldType.Keyword)
    private String departmentId;

    @Field(type = FieldType.Keyword)
    private String ljdm;

    /** Text payload, analysed with the {@code ik_max_word} analyzer (requires the IK plugin). */
    @Field(type = FieldType.Text, analyzer = "ik_max_word")
    private String data;

    @Field(type = FieldType.Keyword)
    private String realName;

    @Field(type = FieldType.Keyword)
    private String url;

    @Field(type = FieldType.Keyword)
    private String type;
}

接口类

package top.fate.controller;

import com.alibaba.fastjson.JSON;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.elasticsearch.core.ElasticsearchRestTemplate;
import org.springframework.data.elasticsearch.core.IndexOperations;
import org.springframework.data.elasticsearch.core.document.Document;
import org.springframework.data.elasticsearch.core.mapping.IndexCoordinates;
import org.springframework.util.Base64Utils;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import top.fate.entity.FileData;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.lang.reflect.Method;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * REST endpoints for the full-text search demo: create/delete the index, upload a
 * sample document through the {@code attachment} ingest pipeline, and search the
 * extracted text with highlighting.
 *
 * <p>All endpoints operate on the same index ({@value #INDEX_NAME}). I/O failures
 * while talking to Elasticsearch or reading the sample file are rethrown as
 * {@link UncheckedIOException} so the client receives an error response instead
 * of a misleading success.
 *
 * @author Wangxl (18335844494@163.com)
 * @since 2022/6/1
 */
@RestController
@RequestMapping(value = "fullTextSearch")
public class FullTextSearchController {

    private static final Logger LOG = Logger.getLogger(FullTextSearchController.class.getName());

    /** Index that every endpoint of this controller reads from and writes to. */
    private static final String INDEX_NAME = "testindex";
    /** Ingest pipeline applied when a file is indexed. */
    private static final String PIPELINE_NAME = "attachment";
    /** Field that is queried and highlighted by {@link #search(String)}. */
    private static final String CONTENT_FIELD = "attachment.content";
    /** Analyzer applied to the search text. */
    private static final String SEARCH_ANALYZER = "ik_max_word";
    /**
     * Sample document uploaded by {@link #uploadFileToEs()}. A PDF can be used the
     * same way, e.g. {@code D:\desktop\Java开发工程师-4年-王晓龙-2022-05.pdf}.
     */
    private static final String SAMPLE_FILE = "D:\\desktop\\Java开发工程师-4年-王晓龙-2022-05.docx";

    @Autowired
    private ElasticsearchRestTemplate elasticsearchRestTemplate;
    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * Creates the index and applies the mapping derived from {@link FileData}.
     * Does nothing when the index already exists, so the endpoint can be called
     * repeatedly.
     */
    @GetMapping("createIndex")
    public void add() {
        IndexOperations indexOperations = elasticsearchRestTemplate.indexOps(IndexCoordinates.of(INDEX_NAME));
        if (indexOperations.exists()) {
            return;
        }
        indexOperations.create();
        Document mapping = indexOperations.createMapping(FileData.class);
        indexOperations.putMapping(mapping);
    }

    /**
     * Deletes the index created by {@link #add()}.
     */
    @GetMapping("deleteIndex")
    public void deleteIndex() {
        // Address the index by name: resolving it from FileData would target the
        // index declared in its @Document annotation, not the one created above.
        IndexOperations indexOperations = elasticsearchRestTemplate.indexOps(IndexCoordinates.of(INDEX_NAME));
        indexOperations.delete();
    }

    /**
     * Reads the sample file, Base64-encodes it and indexes it through the
     * {@code attachment} ingest pipeline. The file name is used as the document id.
     *
     * @throws UncheckedIOException if the file cannot be read or the index request fails
     */
    @GetMapping("uploadFileToEs")
    public void uploadFileToEs() {
        File file = new File(SAMPLE_FILE);
        try {
            // Reads the complete file and always releases the underlying handle.
            byte[] content = Files.readAllBytes(file.toPath());
            // The document is sent to the pipeline as a Base64 string.
            String fileString = Base64Utils.encodeToString(content);

            FileData fileData = new FileData();
            fileData.setFileName(file.getName());
            fileData.setFilePk(file.getName());
            fileData.setData(fileString);

            IndexRequest indexRequest = new IndexRequest(INDEX_NAME).id(fileData.getFilePk());
            indexRequest.source(JSON.toJSONString(fileData), XContentType.JSON);
            indexRequest.setPipeline(PIPELINE_NAME);

            restHighLevelClient.index(indexRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to index file " + file, e);
        }
    }

    /**
     * Searches the extracted document text and returns the matching documents.
     * When a hit has highlight fragments, they are concatenated and returned in
     * the {@code data} property of the result.
     *
     * @param txt text to search for
     * @return list of {@link FileData}, empty when nothing matches
     * @throws UncheckedIOException if the search request fails
     */
    @GetMapping("search")
    public Object search(@RequestParam("txt") String txt) {
        SearchSourceBuilder builder = new SearchSourceBuilder();
        builder.query(QueryBuilders.matchQuery(CONTENT_FIELD, txt).analyzer(SEARCH_ANALYZER));
        // Report the real number of hits.
        builder.trackTotalHits(true);

        // Highlighting.
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field(CONTENT_FIELD);
        highlightBuilder.requireFieldMatch(false);
        highlightBuilder.preTags("<span style='color:red'>");
        highlightBuilder.postTags("</span>");
        builder.highlighter(highlightBuilder);

        SearchRequest searchRequest = new SearchRequest(INDEX_NAME);
        searchRequest.source(builder);

        SearchResponse response;
        try {
            response = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
        } catch (IOException e) {
            throw new UncheckedIOException("Search on index " + INDEX_NAME + " failed", e);
        }

        List<FileData> results = new ArrayList<>();
        if (response.getHits() == null) {
            return results;
        }
        for (SearchHit hit : response.getHits().getHits()) {
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();
            if (sourceAsMap == null) {
                continue;
            }
            HighlightField highlight = hit.getHighlightFields().get(CONTENT_FIELD);
            if (highlight != null) {
                StringBuilder highlighted = new StringBuilder();
                for (Text fragment : highlight.fragments()) {
                    highlighted.append(fragment);
                }
                sourceAsMap.put("data", highlighted.toString());
            }
            FileData item = dealObject(sourceAsMap, FileData.class);
            if (item != null) {
                results.add(item);
            }
        }
        return results;
    }

    /**
     * Copies the entries of a source map into a new instance of {@code clazz} by
     * calling, for each key {@code foo}, the public setter {@code setFoo} whose
     * parameter type equals the runtime class of the value.
     *
     * <p>Entries with a null or empty key, a null value, or no matching setter are
     * skipped. A setter that fails is logged and skipped, so one bad entry does
     * not discard the whole object.
     *
     * @param sourceAsMap property values keyed by property name
     * @param clazz       target type; needs an accessible no-argument constructor
     * @param <T>         target type
     * @return the populated instance, or {@code null} if an argument is null or
     *         the target type cannot be instantiated
     */
    public static <T> T dealObject(Map<String, Object> sourceAsMap, Class<T> clazz) {
        if (sourceAsMap == null || clazz == null) {
            return null;
        }
        final T target;
        try {
            target = clazz.getDeclaredConstructor().newInstance();
        } catch (ReflectiveOperationException e) {
            LOG.log(Level.WARNING, "Cannot instantiate " + clazz.getName(), e);
            return null;
        }
        for (Map.Entry<String, Object> entry : sourceAsMap.entrySet()) {
            String key = entry.getKey();
            Object value = entry.getValue();
            if (key == null || key.isEmpty() || value == null) {
                continue;
            }
            // Plain character handling: the key is never interpreted as a regex and
            // the result does not depend on the default locale.
            String setterName = "set" + Character.toUpperCase(key.charAt(0)) + key.substring(1);
            try {
                Method setter = clazz.getMethod(setterName, value.getClass());
                setter.invoke(target, value);
            } catch (NoSuchMethodException ignored) {
                // No setter for this key/value type: the property is not part of the target.
            } catch (ReflectiveOperationException e) {
                LOG.log(Level.WARNING, "Cannot set property '" + key + "' on " + clazz.getName(), e);
            }
        }
        return target;
    }
}

测试

创建索引

 localhost:9090/fullTextSearch/createIndex

在这里插入图片描述

上传文档

localhost:9090/fullTextSearch/uploadFileToEs

在这里插入图片描述

搜索

localhost:9090/fullTextSearch/search?txt=索引库

在这里插入图片描述