/*
 * Decompiled with CFR 0.152.
 */
package com.googlecode.clearnlp.run;

import com.googlecode.clearnlp.engine.EngineGetter;
import com.googlecode.clearnlp.run.AbstractRun;
import com.googlecode.clearnlp.segmentation.AbstractSegmenter;
import com.googlecode.clearnlp.tokenization.AbstractTokenizer;
import com.googlecode.clearnlp.util.UTArray;
import com.googlecode.clearnlp.util.UTInput;
import com.googlecode.clearnlp.util.UTOutput;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.List;
import org.kohsuke.args4j.Option;

/**
 * Command-line driver that tokenizes one or more input files and writes the
 * resulting tokens to corresponding output files.
 *
 * <p>Options are declared with args4j {@code @Option} annotations. When the
 * input format is {@code "raw"} a segmenter is used to split the input into
 * sentences; for any other input format each input line is tokenized on its
 * own. The output format {@code "line"} writes one token sequence per line;
 * any other output format writes one token per line followed by a blank line.
 */
public class Tokenizer
extends AbstractRun {
    @Option(name="-i", usage="input path (required)", required=true, metaVar="<filepath>")
    private String s_inputPath;
    @Option(name="-ie", usage="input file extension (default: .*)", required=false, metaVar="<regex>")
    private String s_inputExt = ".*";
    @Option(name="-oe", usage="output file extension (default: tok)", required=false, metaVar="<string>")
    private String s_outputExt = "tok";
    @Option(name="-l", usage="language (default: en)", required=false, metaVar="<language>")
    private String s_language = "en";
    @Option(name="-d", usage="name of a dictionary file (required)", required=true, metaVar="<filename>")
    private String s_dictFile;
    @Option(name="-if", usage="input format (default: raw)", required=false, metaVar="<string>")
    private String i_format = "raw";
    @Option(name="-of", usage="output format (default: line)", required=false, metaVar="<string>")
    private String o_format = "line";
    @Option(name="-twit", usage="if set, tokenize for twits", required=false, metaVar="<boolean>")
    protected boolean b_twit;

    /** Creates an instance without parsing arguments or processing any file. */
    public Tokenizer() {
    }

    /**
     * Parses the command-line arguments and tokenizes every selected input file.
     *
     * <p>An {@link IOException} raised while processing a file is printed to
     * standard error and stops the processing of the remaining files; it is
     * not propagated to the caller.
     *
     * @param args the command-line arguments
     */
    public Tokenizer(String[] args) {
        this.initArgs(args);
        AbstractTokenizer tokenizer = EngineGetter.getTokenizer(this.s_language, this.s_dictFile);
        // A segmenter is only needed for raw input; otherwise input is tokenized line by line.
        // Constant-first comparison avoids a NullPointerException if the option value is null.
        AbstractSegmenter segmenter = "raw".equals(this.i_format) ? EngineGetter.getSegmenter(this.s_language, tokenizer) : null;
        // Each entry is used below as {input file, output file}.
        List<String[]> filenames = this.getFilenames(this.s_inputPath, this.s_inputExt, this.s_outputExt);
        boolean outLine = "line".equals(this.o_format);
        tokenizer.setTwit(this.b_twit);
        try {
            for (String[] io : filenames) {
                // Report the input file currently being processed.
                System.out.println(io[0]);
                this.tokenize(tokenizer, segmenter, io[0], io[1], outLine);
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tokenizes a single input file and writes the tokens to an output file.
     *
     * <p>Both streams are closed before this method returns, whether it
     * completes normally or by throwing.
     *
     * @param tokenizer the tokenizer applied to each line when {@code segmenter} is {@code null}
     * @param segmenter the segmenter that reads sentences from the input, or {@code null}
     *                  to tokenize the input line by line
     * @param inputFile the path of the file to read
     * @param outputFile the path of the file to write
     * @param outLine if {@code true}, each token sequence is written on one line;
     *                otherwise one token is written per line
     * @throws IOException if reading or closing the input fails
     */
    public void tokenize(AbstractTokenizer tokenizer, AbstractSegmenter segmenter, String inputFile, String outputFile, boolean outLine) throws IOException {
        BufferedReader fin = UTInput.createBufferedFileReader(inputFile);
        try {
            PrintStream fout = UTOutput.createPrintBufferedFileStream(outputFile);
            try {
                if (segmenter == null) {
                    String line;
                    while ((line = fin.readLine()) != null) {
                        this.print(fout, tokenizer.getTokens(line), outLine);
                    }
                } else {
                    for (List<String> tokens : segmenter.getSentences(fin)) {
                        this.print(fout, tokens, outLine);
                    }
                }
            } finally {
                // Always release the output so it is not leaked when processing fails midway.
                fout.close();
            }
        } finally {
            // Also covers the case where opening the output stream fails.
            fin.close();
        }
    }

    /**
     * Writes one token sequence to the output stream.
     *
     * @param fout the stream to write to
     * @param tokens the tokens to write
     * @param outLine if {@code true}, the tokens are joined with a single space and
     *                written as one line; otherwise they are joined with newlines and
     *                followed by an extra newline
     */
    private void print(PrintStream fout, List<String> tokens, boolean outLine) {
        if (outLine) {
            fout.println(UTArray.join(tokens, " "));
        } else {
            fout.println(UTArray.join(tokens, "\n") + "\n");
        }
    }

    /**
     * Program entry point; all work is done by the constructor.
     *
     * @param args the command-line arguments
     */
    public static void main(String[] args) {
        new Tokenizer(args);
    }
}

