package com.metaeffekt.artifact.analysis.preprocess.filter.wordlist;

import com.metaeffekt.artifact.terms.model.Evidence;
import com.metaeffekt.artifact.terms.model.Masks;
import com.metaeffekt.artifact.terms.model.MatchSet;
import com.metaeffekt.artifact.terms.model.NormalizationMetaData;
import com.metaeffekt.artifact.terms.model.TermsMetaData;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/metaeffekt/artifact/analysis/preprocess/filter/wordlist/WordlistGenerator.class */
/**
 * Derives word lists from license texts and from the descriptive fields of {@link TermsMetaData}.
 *
 * <p>Texts are tokenized with {@link #SPLIT_PATTERN}; tokens rejected by {@link #shouldSkip(String)}
 * are dropped. Words taken from license texts are lower-cased and ordered by how often they occur.
 */
public class WordlistGenerator {
    private static final Logger log = LoggerFactory.getLogger(WordlistGenerator.class);

    /**
     * Token delimiters: Unicode space separators, any line break sequence, punctuation, tabs,
     * modifier symbols, math symbols and format characters.
     */
    protected static final Pattern SPLIT_PATTERN = Pattern.compile("(\\p{Zs}|\\R|\\p{P}|\t|\\p{Sk}|\\p{Sm}|\\p{Cf})");

    /** Matches strings made up of digits only; compiled once because it is applied to every token. */
    private static final Pattern DIGITS_ONLY_PATTERN = Pattern.compile("\\d+");

    /** Strings with fewer UTF-16 code units than this are only kept if they contain an ideograph. */
    private static final int SHORT_WORD_LENGTH = 4;

    /** Line length (in UTF-16 code units) from which a license file is reported as having very long lines. */
    private static final int VERY_LONG_LINE_LENGTH = 8192;

    /** Unicode REPLACEMENT CHARACTER (U+FFFD); checked for in every license text that is read. */
    private static final String REPLACEMENT_CHARACTER = "\uFFFD";

    /**
     * Creates the word list for all terms contained in the given normalization metadata.
     *
     * @param normalizationMetaData source of the license metadata map
     * @return the generated words, see {@link #createWordlist(Map)}
     * @throws IOException if a referenced license file cannot be read
     */
    public static Collection<String> createWordlist(NormalizationMetaData normalizationMetaData) throws IOException {
        return createWordlist(normalizationMetaData.getLicenseMetaDataMap());
    }

    /**
     * Creates the word list for the given terms: first the words found in the license texts, then the
     * words derived from the metadata fields. Entries occurring in both parts are not de-duplicated.
     *
     * @param map terms metadata by key
     * @return a mutable list without {@code null} or empty entries
     * @throws IOException if a referenced license file cannot be read
     */
    public static List<String> createWordlist(Map<String, TermsMetaData> map) throws IOException {
        final List<String> words = new ArrayList<>();
        words.addAll(getLicenseTextWords(map));
        words.addAll(getMetadataWords(map));
        words.removeIf(word -> word == null || word.isEmpty());
        return words;
    }

    /**
     * Collects words from the metadata fields of the given terms.
     *
     * <p>Names (alternative names, canonical name and its history, SPDX identifier, short name) are
     * taken over unsplit. Mask matches, mapping keys and values, and evidence matches/excludes/one-of
     * matches are split with {@link #getSplit(String)} and contribute their individual tokens. In both
     * cases anything rejected by {@link #shouldSkip(String)} is left out.
     *
     * @param map terms metadata by key
     * @return the distinct collected words
     */
    public static Collection<String> getMetadataWords(Map<String, TermsMetaData> map) {
        final Collection<String> names = new HashSet<>();
        final Collection<String> phrases = new HashSet<>();

        for (TermsMetaData termsMetaData : map.values()) {
            addAllIfPresent(names, termsMetaData.getAlternativeNames());
            addAllIfPresent(names, termsMetaData.getCanonicalNameHistory());
            names.add(termsMetaData.getCanonicalName());
            names.add(termsMetaData.getSpdxIdentifier());
            names.add(termsMetaData.getShortName());

            final Masks masks = termsMetaData.getMasks();
            if (masks != null) {
                addAllIfPresent(phrases, masks.getMatches());
            }

            final Map<String, String> mappings = termsMetaData.getMappings();
            if (mappings != null) {
                phrases.addAll(mappings.keySet());
                phrases.addAll(mappings.values());
            }

            final Evidence evidence = termsMetaData.getEvidence();
            if (evidence != null) {
                addAllIfPresent(phrases, evidence.getMatches());
                addAllIfPresent(phrases, evidence.getExcludes());
                final List<MatchSet> oneOf = evidence.getOneOf();
                if (oneOf != null) {
                    for (MatchSet matchSet : oneOf) {
                        if (matchSet != null) {
                            addAllIfPresent(phrases, matchSet.getMatches());
                        }
                    }
                }
            }
        }

        final HashSet<String> words = new HashSet<>();
        for (String name : names) {
            if (!shouldSkip(name)) {
                words.add(name);
            }
        }
        for (String phrase : phrases) {
            // The phrase as a whole is filtered first; only then are its tokens considered.
            if (shouldSkip(phrase)) {
                continue;
            }
            for (String token : getSplit(phrase)) {
                if (!shouldSkip(token)) {
                    words.add(token);
                }
            }
        }
        return words;
    }

    /**
     * Splits the given text at every match of {@link #SPLIT_PATTERN}.
     *
     * <p>Adjacent delimiters yield empty tokens; callers in this class filter those with
     * {@link #shouldSkip(String)}.
     *
     * @param str text to split, must not be {@code null}
     * @return the tokens between the delimiters
     */
    public static String[] getSplit(String str) {
        return SPLIT_PATTERN.split(str);
    }

    /**
     * Decides whether a string is left out of the word list.
     *
     * @param str candidate, may be {@code null}
     * @return {@code true} for {@code null} and empty strings, for strings shorter than four UTF-16
     *         code units that contain no ideographic code point, and for longer strings consisting of
     *         digits only
     */
    public static boolean shouldSkip(String str) {
        if (str == null || str.isEmpty()) {
            return true;
        }
        if (str.length() < SHORT_WORD_LENGTH) {
            return str.codePoints().noneMatch(Character::isIdeographic);
        }
        return DIGITS_ONLY_PATTERN.matcher(str).matches();
    }

    /**
     * Turns word counts into a list of words ordered by descending count.
     *
     * <p>Side effect: the empty-string key is removed from the given map.
     *
     * @param map occurrence count per word; modified as described above
     * @return the words, most frequent first
     */
    public static List<String> finalizeWordCollection(Map<String, AtomicInteger> map) {
        map.remove("");

        final List<String> words = map.entrySet().stream()
                // Integer.compare instead of subtraction: cannot overflow, whatever the counts are.
                .sorted((left, right) -> Integer.compare(right.getValue().get(), left.getValue().get()))
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());

        if (log.isDebugEnabled()) {
            final List<String> shortWords = words.stream()
                    .filter(word -> word.length() <= 1)
                    .collect(Collectors.toList());
            log.debug("Number of short \"words\": {}", shortWords.size());
            log.debug("{}", shortWords);
            log.debug("Number of words generated from license texts: {}", words.size());
        }
        return words;
    }

    /**
     * Reads the license file of every entry and counts the lower-cased tokens of its text.
     *
     * <p>Entries without a license file path ({@code null} or blank) are skipped. Files are decoded
     * as UTF-8.
     *
     * @param map terms metadata by key
     * @return the counted words ordered by descending count, see {@link #finalizeWordCollection(Map)}
     * @throws IOException if a license file cannot be opened or read
     */
    public static Collection<String> getLicenseTextWords(Map<String, TermsMetaData> map) throws IOException {
        // Kept as ConcurrentHashMap: its iteration order decides the order of equally frequent words.
        final Map<String, AtomicInteger> wordCounts = new ConcurrentHashMap<>();
        int longestLineOverall = 0;

        for (Map.Entry<String, TermsMetaData> entry : map.entrySet()) {
            final String licenseFilePath = entry.getValue().getLicenseFile();
            if (licenseFilePath == null) {
                continue;
            }
            if (StringUtils.isBlank(licenseFilePath)) {
                log.debug("Non-null but blank licenseFilePathString in '{}'. Skipping.", entry.getKey());
                continue;
            }

            final File licenseFile = new File(licenseFilePath);
            if (!Files.isRegularFile(licenseFile.toPath())) {
                // NOTE(review): the file is still opened below, so a failure to open it propagates as
                // IOException; this message only adds context. Confirm that failing (rather than
                // skipping the entry) is the intended behavior.
                log.error("Failed to include [{}]'s license at [{}]: Not a file.", entry.getKey(), licenseFile);
            }

            final String licenseText = readLicenseText(licenseFile);

            final int longestLine = longestLineLength(licenseText);
            longestLineOverall = Math.max(longestLineOverall, longestLine);
            if (log.isTraceEnabled() && longestLine >= VERY_LONG_LINE_LENGTH) {
                log.trace("very long line(s) in file [{}].", licenseFilePath);
            }
            if (licenseText.contains(REPLACEMENT_CHARACTER)) {
                log.debug("Unicode 'REPLACEMENT CHARACTER' found in license text at [{}].", licenseFilePath);
            }

            countWords(licenseText, wordCounts);
        }

        log.debug("Longest line found: {} code units.", longestLineOverall);
        return finalizeWordCollection(wordCounts);
    }

    /** Adds all elements of {@code source} to {@code target}; a {@code null} source adds nothing. */
    private static void addAllIfPresent(Collection<String> target, Collection<? extends String> source) {
        if (source != null) {
            target.addAll(source);
        }
    }

    /** Reads the whole file as UTF-8 text and closes the stream afterwards. */
    private static String readLicenseText(File licenseFile) throws IOException {
        try (InputStream inputStream = Files.newInputStream(licenseFile.toPath())) {
            return IOUtils.toString(inputStream, StandardCharsets.UTF_8);
        }
    }

    /** Returns the length in UTF-16 code units of the longest {@code \n}-separated line, or 0. */
    private static int longestLineLength(String text) {
        return Arrays.stream(text.split("\n")).mapToInt(String::length).max().orElse(0);
    }

    /** Increments the count of every lower-cased token of the text that is not skipped. */
    private static void countWords(String licenseText, Map<String, AtomicInteger> wordCounts) {
        for (String token : getSplit(licenseText)) {
            final String word = token.toLowerCase(Locale.ENGLISH);
            if (!shouldSkip(word)) {
                wordCounts.computeIfAbsent(word, key -> new AtomicInteger()).incrementAndGet();
            }
        }
    }
}
