package com.metaeffekt.artifact.analysis.preprocess.filter;

import com.metaeffekt.artifact.analysis.preprocess.filter.charfile.CharFileHelper;
import com.metaeffekt.artifact.analysis.preprocess.filter.charfile.CharSequenceFile;
import com.metaeffekt.artifact.analysis.preprocess.filter.matcher.StringMatchGeneratorAhoCorasick;
import com.metaeffekt.artifact.analysis.preprocess.filter.range.IntegerDiscreteRangeMap;
import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/metaeffekt/artifact/analysis/preprocess/filter/TextSieve.class */
/**
 * Reduces a text to the regions reported by a wordlist-driven matcher.
 *
 * <p>A {@link StringMatchGeneratorAhoCorasick} is built once from the configured wordlist and
 * {@code includeReach}. For a given input, the ranges it reports are copied out of the text, each
 * followed by {@link NormalizationMetaData#STRING_WHITESPACE}, and the concatenation is passed
 * through {@link TextFilterFunctions#removeInvalidUnicode}.
 *
 * <p>Instances are created through {@link #builder()}.
 */
public class TextSieve {
    /** Matcher built from {@link #wordlist} and {@link #includeReach}. */
    protected final StringMatchGeneratorAhoCorasick generator;

    /**
     * Reach value handed to the matcher. NOTE(review): presumably the number of characters of
     * context kept around each match; confirm against StringMatchGeneratorAhoCorasick.
     */
    protected final int includeReach;

    /** Words the matcher searches for; an unmodifiable list without null or empty entries. */
    protected final List<String> wordlist;

    /**
     * Builder for {@link TextSieve}. A wordlist must be supplied before {@link #build()};
     * {@code includeReach} defaults to 128.
     */
    public static class TextSieveBuilder {
        public static final Logger LOG = LoggerFactory.getLogger(TextSieveBuilder.class);
        private List<String> wordlist = null;
        private int includeReach = 128;

        /**
         * Sets the reach value that is passed on to the matcher.
         *
         * @param i the reach value
         * @return this builder
         */
        public TextSieveBuilder includeReach(int i) {
            this.includeReach = i;
            return this;
        }

        /**
         * Sets the wordlist from a copy of the given collection. Null and empty entries are
         * dropped, and a warning is logged for each kind that was present.
         *
         * @param collection the words to search for; must not be null
         * @return this builder
         * @throws NullPointerException if {@code collection} is null
         */
        public TextSieveBuilder wordlist(Collection<String> collection) {
            Objects.requireNonNull(collection, "collection");
            // Typed copy: the caller's collection is never modified, and the element type is
            // known so that String::isEmpty resolves.
            final List<String> words = new ArrayList<>(collection);
            if (words.removeIf(Objects::isNull)) {
                LOG.warn("Removed null word from wordlist while loading into TextSieveBuilder.");
            }
            if (words.removeIf(String::isEmpty)) {
                LOG.warn("Removed empty word from wordlist while loading into TextSieveBuilder.");
            }
            this.wordlist = Collections.unmodifiableList(words);
            return this;
        }

        /**
         * Sets the wordlist from {@link NormalizationMetaData#getWordlist()}, applying the same
         * cleaning as {@link #wordlist(Collection)}.
         *
         * @param normalizationMetaData the source of the wordlist
         * @return this builder
         */
        public TextSieveBuilder wordlist(NormalizationMetaData normalizationMetaData) {
            return wordlist(Collections.unmodifiableCollection(normalizationMetaData.getWordlist()));
        }

        /**
         * Creates the sieve.
         *
         * @return a new {@link TextSieve}
         * @throws IllegalArgumentException if no wordlist was set, or if it is empty after
         *                                  null and empty entries were removed
         */
        public TextSieve build() {
            if (this.wordlist == null) {
                throw new IllegalArgumentException("Can't build: wordlist is unset.");
            }
            if (this.wordlist.isEmpty()) {
                throw new IllegalArgumentException("Can't build: wordlist is empty.");
            }
            return new TextSieve(this);
        }
    }

    /**
     * Copies the configuration from the builder and constructs the matcher.
     *
     * @param textSieveBuilder the builder holding wordlist and reach
     */
    protected TextSieve(TextSieveBuilder textSieveBuilder) {
        this.includeReach = textSieveBuilder.includeReach;
        this.wordlist = textSieveBuilder.wordlist;
        this.generator = new StringMatchGeneratorAhoCorasick(this.wordlist, this.includeReach);
    }

    /**
     * @return a fresh builder with default settings
     */
    public static TextSieveBuilder builder() {
        return new TextSieveBuilder();
    }

    /**
     * Filters a file via an intermediate data file; delegates to
     * {@link #loadFilteredToStringBuilder(File, Charset, File)}.
     *
     * @param file    the file to read
     * @param charset the charset used to decode {@code file}
     * @param file2   the directory in which the intermediate data file is created
     * @return the filtered text
     * @throws IOException if reading the input or handling the intermediate file fails
     */
    public StringBuilder loadFiltered(File file, Charset charset, File file2) throws IOException {
        return loadFilteredToStringBuilder(file, charset, file2);
    }

    /**
     * Filters a file by first converting it into an intermediate data file located in
     * {@code file2} and running the matcher over a {@link CharSequenceFile} view of it.
     *
     * <p>The intermediate file name is a random UUID followed by the input file name (non-word
     * characters replaced by {@code _}, truncated to 128 characters) and the suffix
     * {@code .jchars.bin}.
     *
     * @param file    the file to read
     * @param charset the charset used to decode {@code file}
     * @param file2   the directory in which the intermediate data file is created
     * @return the matched regions, whitespace-separated, with invalid unicode removed
     * @throws IOException if reading the input or handling the intermediate file fails
     */
    protected StringBuilder loadFilteredToStringBuilder(File file, Charset charset, File file2) throws IOException {
        final String sanitizedName = StringUtils.truncate(file.getName().replaceAll("\\W", "_"), 128);
        final File dataFile = new File(file2, UUID.randomUUID().toString() + sanitizedName + ".jchars.bin");

        // try-with-resources closes both resources in reverse order and records a failing
        // close() as a suppressed exception instead of letting it replace the primary failure.
        // NOTE(review): the meaning of the boolean arguments (true / false) is defined by
        // CharFileHelper and CharSequenceFile and is not visible here.
        try (CharFileHelper helper = CharFileHelper.createDataFile(file, dataFile, charset, true);
             CharSequenceFile text = new CharSequenceFile(helper, false)) {
            final IntegerDiscreteRangeMap<Integer> matchMap = this.generator.getMatchMap(text);
            return TextFilterFunctions.removeInvalidUnicode(appendMatchedRanges(matchMap, text));
        }
    }

    /**
     * Filters a file held entirely in memory; delegates to
     * {@link #loadFilteredDirectlyToStringBuilder(File, Charset)}.
     *
     * @param file    the file to read
     * @param charset the charset used to decode {@code file}
     * @return the filtered text
     * @throws IOException if the file cannot be read
     */
    public StringBuilder loadFilteredDirectly(File file, Charset charset) throws IOException {
        return loadFilteredDirectlyToStringBuilder(file, charset);
    }

    /**
     * Reads the whole file into a string and runs the matcher over it, without creating an
     * intermediate file.
     *
     * @param file    the file to read
     * @param charset the charset used to decode {@code file}
     * @return the matched regions, whitespace-separated, with invalid unicode removed
     * @throws IOException if the file cannot be read
     */
    public StringBuilder loadFilteredDirectlyToStringBuilder(File file, Charset charset) throws IOException {
        final String text = FileUtils.readFileToString(file, charset);
        final IntegerDiscreteRangeMap<Integer> matchMap = this.generator.getMatchMap(text);
        return TextFilterFunctions.removeInvalidUnicode(appendMatchedRanges(matchMap, text));
    }

    /**
     * Concatenates every range of {@code matchMap} that lies within {@code text}, appending
     * {@link NormalizationMetaData#STRING_WHITESPACE} after each one.
     */
    private static StringBuilder appendMatchedRanges(IntegerDiscreteRangeMap<Integer> matchMap, CharSequence text) {
        final StringBuilder result = new StringBuilder();
        final int lastIndex = text.length() - 1;
        if (lastIndex < 0) {
            // Empty text cannot contain matches; this also avoids asking the range map for the
            // inverted range (0, -1), whose handling is not visible from this class.
            return result;
        }
        for (Map.Entry<Integer, Integer> range : matchMap.subRangeSet(0, lastIndex)) {
            // The entry value is used as an inclusive end index, hence the + 1 for the
            // exclusive end expected by StringBuilder.append.
            result.append(text, range.getKey(), range.getValue() + 1)
                    .append(NormalizationMetaData.STRING_WHITESPACE);
        }
        return result;
    }
}
