/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.crawl.MD5Signature;
import org.apache.nutch.crawl.Signature;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;

/**
 * Signature implementation that hashes a quantized term-frequency profile of
 * the parsed text instead of the raw content.
 *
 * <p>The text is split into lower-cased runs of letters and digits. Runs that
 * are not strictly longer than {@link #MIN_TOKEN_LEN} are ignored. Token counts
 * are rounded down to a multiple of a quantization step derived from the
 * highest count and {@link #QUANT_RATE}; tokens whose rounded count falls
 * below the step are dropped. The remaining tokens are sorted by decreasing
 * count, rendered one per line as {@code "<token> <count>"}, and the MD5
 * digest of that listing is the signature.
 *
 * <p>When no parse text is available the calculation is delegated to
 * {@link #fallback}.
 */
public class TextProfileSignature
extends Signature {
    /** Signature used when the parse is missing or has no text. */
    Signature fallback = new MD5Signature();
    /** A token is counted only if its length is strictly greater than this value. */
    int MIN_TOKEN_LEN = 2;
    /** Fraction of the highest token count that is used as the quantization step. */
    float QUANT_RATE = 0.01f;
    /** Whether tokens with equal quantized counts are ordered by their text. */
    boolean secondaryLexicographicSorting = true;

    /**
     * Stores the configuration and reads the profile settings from it.
     *
     * <p>Keys read (with defaults): {@code db.signature.text_profile.min_token_len}
     * (2), {@code db.signature.text_profile.quant_rate} (0.01) and
     * {@code db.signature.text_profile.sec_sort_lex} (true).
     *
     * @param conf configuration to read the settings from
     */
    @Override
    public void setConf(Configuration conf) {
        super.setConf(conf);
        this.MIN_TOKEN_LEN = conf.getInt("db.signature.text_profile.min_token_len", 2);
        this.QUANT_RATE = conf.getFloat("db.signature.text_profile.quant_rate", 0.01f);
        this.secondaryLexicographicSorting = conf.getBoolean("db.signature.text_profile.sec_sort_lex", true);
    }

    /**
     * Computes the signature for a page.
     *
     * @param content raw content; only passed on to the fallback signature
     * @param parse   parse result whose text is profiled; may be {@code null}
     * @return the MD5 digest of the quantized token profile, or the fallback
     *         signature's result when {@code parse} is {@code null} or its
     *         text is {@code null} or empty
     */
    @Override
    public byte[] calculate(Content content, Parse parse) {
        String text = parse != null ? parse.getText() : null;
        if (text == null || text.length() == 0) {
            return this.fallback.calculate(content, parse);
        }

        // Tokenize: letters/digits extend the current token, anything else ends it.
        HashMap<String, Token> tokens = new HashMap<String, Token>();
        StringBuilder curToken = new StringBuilder();
        int maxFreq = 0;
        for (int i = 0; i < text.length(); ++i) {
            char c = text.charAt(i);
            if (Character.isLetterOrDigit(c)) {
                curToken.append(Character.toLowerCase(c));
                continue;
            }
            maxFreq = Math.max(maxFreq, this.countToken(tokens, curToken));
            curToken.setLength(0);
        }
        // The text may end in the middle of a token.
        maxFreq = Math.max(maxFreq, this.countToken(tokens, curToken));

        // Quantization step: at least 2 once any token repeats, otherwise 1.
        int quant = Math.round((float)maxFreq * this.QUANT_RATE);
        if (quant < 2) {
            quant = maxFreq > 1 ? 2 : 1;
        }

        // Round counts down to a multiple of the step and drop those that vanish.
        ArrayList<Token> profile = new ArrayList<Token>();
        for (Token t : tokens.values()) {
            t.cnt = t.cnt / quant * quant;
            if (t.cnt < quant) {
                continue;
            }
            profile.add(t);
        }
        Collections.sort(profile, new TokenComparator());

        StringBuilder newText = new StringBuilder();
        for (Token t : profile) {
            if (newText.length() > 0) {
                newText.append("\n");
            }
            newText.append(t.toString());
        }
        return MD5Hash.digest((String)newText.toString()).getDigest();
    }

    /**
     * Records one occurrence of the token held in {@code curToken}, provided it
     * is strictly longer than {@link #MIN_TOKEN_LEN}.
     *
     * @param tokens   token table to update
     * @param curToken buffer holding the candidate token; not modified
     * @return the token's count after the update, or 0 if it was too short
     */
    private int countToken(HashMap<String, Token> tokens, StringBuilder curToken) {
        if (curToken.length() <= this.MIN_TOKEN_LEN) {
            return 0;
        }
        String s = curToken.toString();
        Token tok = tokens.get(s);
        if (tok == null) {
            tok = new Token(0, s);
            tokens.put(s, tok);
        }
        return ++tok.cnt;
    }

    /**
     * Command-line helper: prints the signature of every regular file in a
     * directory as {@code <path>\t<hex signature>}. Files are read as UTF-8.
     *
     * @param args {@code args[0]} is the directory to scan
     * @throws Exception if a file cannot be read
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: TextProfileSignature <dir>");
            System.exit(-1);
        }
        File[] files = new File(args[0]).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a readable directory.
            System.err.println("Not a readable directory: " + args[0]);
            System.exit(-1);
        }
        TextProfileSignature sig = new TextProfileSignature();
        sig.setConf(NutchConfiguration.create());
        HashMap<String, byte[]> res = new HashMap<String, byte[]>();
        for (File file : files) {
            if (!file.isFile()) {
                // Sub-directories cannot be opened as streams; skip them.
                continue;
            }
            StringBuilder text = new StringBuilder();
            // try-with-resources closes the stream even if reading fails.
            try (FileInputStream fis = new FileInputStream(file);
                 BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    if (text.length() > 0) {
                        text.append("\n");
                    }
                    text.append(line);
                }
            }
            byte[] signature = sig.calculate(null, new ParseImpl(text.toString(), null));
            res.put(file.toString(), signature);
        }
        for (String name : res.keySet()) {
            byte[] signature = res.get(name);
            System.out.println(name + "\t" + StringUtil.toHexString(signature));
        }
    }

    /**
     * Orders tokens by decreasing count. Ties are broken by token text when
     * secondary lexicographic sorting is enabled; otherwise tied tokens compare
     * as equal and keep the order in which they were added to the list.
     */
    private class TokenComparator
    implements Comparator<Token> {
        private TokenComparator() {
        }

        @Override
        public int compare(Token t1, Token t2) {
            // Arguments are swapped so that higher counts sort first.
            int byCount = Integer.compare(t2.cnt, t1.cnt);
            if (byCount == 0 && TextProfileSignature.this.secondaryLexicographicSorting) {
                return t1.val.compareTo(t2.val);
            }
            return byCount;
        }
    }

    /** A token together with its (possibly quantized) occurrence count. */
    private static class Token {
        public int cnt;
        public String val;

        public Token(int cnt, String val) {
            this.cnt = cnt;
            this.val = val;
        }

        /** Renders the token as {@code "<val> <cnt>"}, the form that is hashed. */
        @Override
        public String toString() {
            return this.val + " " + this.cnt;
        }
    }
}

