/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.text.wikipedia;

import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.GenericsUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Maps one Wikipedia page (an XML fragment held in the input {@link Text} value) to a
 * {@code (label, article text)} pair.
 *
 * <p>The output key has the form {@code /<category>/<title>}, where the category is the first
 * configured category found in the article's {@code [[Category:...]]} tags (or {@code unknown}
 * when none matches and {@code all.files} is set) and whitespace in the title is replaced by
 * underscores. The output value is the HTML-unescaped body of the page's {@code <text>} element.
 *
 * <p>Configuration keys read in {@code setup}:
 * <ul>
 *   <li>{@code wikipedia.categories} - a {@code Set<String>} serialized with
 *       {@link DefaultStringifier}; required.</li>
 *   <li>{@code exact.match.only} - when true a category tag must equal a configured category;
 *       otherwise it only has to contain one.</li>
 *   <li>{@code all.files} - when true, pages without a matching category are emitted as well.</li>
 *   <li>{@code remove.labels} - when true, category tags are stripped from the emitted text.</li>
 * </ul>
 */
public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
    private static final Logger log = LoggerFactory.getLogger(WikipediaMapper.class);
    private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s]");
    /** Opening {@code <text>} tag, with or without attributes (e.g. {@code xml:space}, {@code bytes}). */
    private static final Pattern TEXT_OPEN_TAG = Pattern.compile("<text(?:\\s[^>]*)?>");
    private static final String END_DOC = "</text>";
    private static final Pattern TITLE = Pattern.compile("<title>(.*)<\\/title>");
    /** Prefix shared by {@code <redirect />} and {@code <redirect title="..." />}. */
    private static final String REDIRECT_PREFIX = "<redirect ";
    private static final String CATEGORY_TAG_START = "[[Category:";
    private static final String CATEGORY_TAG_END = "]]";
    /** Label returned by {@link #findMatchingCategory(String)} when no configured category matches. */
    private static final String UNKNOWN_CATEGORY = "Unknown";

    private Set<String> inputCategories;
    private boolean exactMatchOnly;
    private boolean all;
    private boolean removeLabels;

    /**
     * Emits {@code (/<category>/<title>, article text)} for one page. Overrides
     * {@code Mapper#map}.
     *
     * <p>Nothing is emitted for redirect pages, for pages without a {@code <text>} body, and
     * (unless {@code all.files} is set) for pages that match none of the configured categories.
     *
     * @param key     input key supplied by the input format; only used for diagnostics
     * @param value   XML of a single page
     * @param context task context the output pair is written to
     */
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        String content = value.toString();
        if (content.contains(REDIRECT_PREFIX)) {
            return;
        }

        String document = WikipediaMapper.getDocument(content);
        if (document == null) {
            // Malformed or body-less page: skip it explicitly instead of relying on an exception.
            log.debug("Skipping record with key {}: no <text> body found", key);
            return;
        }
        String title = WikipediaMapper.getTitle(content);

        // Categories are matched against the still-escaped text, as before.
        String catMatch = this.findMatchingCategory(document);
        if (!this.all && UNKNOWN_CATEGORY.equals(catMatch)) {
            return;
        }

        document = StringEscapeUtils.unescapeHtml4(document);
        if (this.removeLabels) {
            document = this.removeCategoriesFromText(document);
        }

        String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/"
                + SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
        context.write(new Text(category), new Text(document));
    }

    /**
     * Reads the category set and the boolean switches from the job configuration. Overrides
     * {@code Mapper#setup}.
     *
     * @throws IllegalStateException if {@code wikipedia.categories} is not set
     * @throws IOException           if the serialized category set cannot be decoded
     */
    protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        super.setup(context);
        Configuration conf = context.getConfiguration();

        // The (empty) prototype set is only used to tell the stringifier which class to decode.
        Set<String> newCategories = new HashSet<String>();
        DefaultStringifier<Set<String>> setStringifier =
                new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories));
        try {
            String categoriesStr = conf.get("wikipedia.categories");
            if (categoriesStr == null) {
                throw new IllegalStateException("Configuration property 'wikipedia.categories' is not set");
            }
            this.inputCategories = setStringifier.fromString(categoriesStr);
        } finally {
            setStringifier.close();
        }

        this.exactMatchOnly = conf.getBoolean("exact.match.only", false);
        this.all = conf.getBoolean("all.files", false);
        this.removeLabels = conf.getBoolean("remove.labels", false);
        log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}", new Object[]{this.inputCategories.size(), this.all, this.exactMatchOnly, this.removeLabels});
    }

    /**
     * Extracts the raw (still XML-escaped) body of the first {@code <text>} element.
     *
     * @param xml XML of a single page
     * @return the text between the opening {@code <text ...>} tag and the following
     *         {@code </text>}, or {@code null} if the element is missing, self-closing or
     *         unterminated
     */
    private static String getDocument(String xml) {
        Matcher open = TEXT_OPEN_TAG.matcher(xml);
        if (!open.find()) {
            return null;
        }
        // "<text ... />" has no body and no closing tag.
        if (xml.charAt(open.end() - 2) == '/') {
            return null;
        }
        int start = open.end();
        int end = xml.indexOf(END_DOC, start);
        if (end < 0) {
            return null;
        }
        return xml.substring(start, end);
    }

    /**
     * Returns the content of the first {@code <title>} element, or the empty string if there is
     * none.
     */
    private static String getTitle(CharSequence xml) {
        Matcher m = TITLE.matcher(xml);
        return m.find() ? m.group(1) : "";
    }

    /**
     * Scans the {@code [[Category:...]]} tags of {@code document} in order and returns the first
     * configured category that matches.
     *
     * <p>Each tag's content is lower-cased and trimmed before comparison. In exact-match mode the
     * content must be an element of the configured set; otherwise it only has to contain one of
     * the configured categories as a substring.
     *
     * @return the matching category in lower case, or {@code "Unknown"} if no tag matches
     */
    private String findMatchingCategory(String document) {
        int searchFrom = 0;
        while (true) {
            int tagStart = document.indexOf(CATEGORY_TAG_START, searchFrom);
            if (tagStart < 0) {
                break;
            }
            int nameStart = tagStart + CATEGORY_TAG_START.length();
            int nameEnd = document.indexOf(CATEGORY_TAG_END, nameStart);
            if (nameEnd < 0) {
                // Unterminated tag: nothing more to inspect.
                break;
            }

            String category = document.substring(nameStart, nameEnd).toLowerCase(Locale.ENGLISH).trim();
            if (this.exactMatchOnly) {
                if (this.inputCategories.contains(category)) {
                    return category;
                }
            } else {
                for (String inputCategory : this.inputCategories) {
                    if (category.contains(inputCategory)) {
                        return inputCategory.toLowerCase(Locale.ENGLISH);
                    }
                }
            }
            searchFrom = nameEnd;
        }
        return UNKNOWN_CATEGORY;
    }

    /**
     * Removes every complete {@code [[Category:...]]} tag from {@code document} in a single pass.
     * A trailing tag without a closing {@code ]]} is left in place.
     *
     * @return the text without category tags; never {@code null}
     */
    private String removeCategoriesFromText(String document) {
        int tagStart = document.indexOf(CATEGORY_TAG_START);
        if (tagStart < 0) {
            return document;
        }

        StringBuilder stripped = new StringBuilder(document.length());
        int copyFrom = 0;
        while (tagStart >= 0) {
            int tagEnd = document.indexOf(CATEGORY_TAG_END, tagStart);
            if (tagEnd < 0) {
                break;
            }
            stripped.append(document, copyFrom, tagStart);
            copyFrom = tagEnd + CATEGORY_TAG_END.length();
            tagStart = document.indexOf(CATEGORY_TAG_START, copyFrom);
        }
        stripped.append(document, copyFrom, document.length());
        return stripped.toString();
    }
}

