/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.compress.lib;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.compress.colgroup.AColGroup;
import org.apache.sysds.runtime.compress.colgroup.AColGroupCompressed;
import org.apache.sysds.runtime.compress.colgroup.ASDCZero;
import org.apache.sysds.runtime.compress.colgroup.ColGroupConst;
import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
import org.apache.sysds.runtime.compress.lib.CLALibUtils;
import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.functionobjects.Builtin;
import org.apache.sysds.runtime.functionobjects.IndexFunction;
import org.apache.sysds.runtime.functionobjects.KahanFunction;
import org.apache.sysds.runtime.functionobjects.KahanPlus;
import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
import org.apache.sysds.runtime.functionobjects.Mean;
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Plus;
import org.apache.sysds.runtime.functionobjects.ReduceAll;
import org.apache.sysds.runtime.functionobjects.ReduceCol;
import org.apache.sysds.runtime.functionobjects.ReduceRow;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
import org.apache.sysds.runtime.matrix.data.LibMatrixAgg;
import org.apache.sysds.runtime.matrix.data.LibMatrixBincell;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.data.MatrixIndexes;
import org.apache.sysds.runtime.matrix.data.MatrixValue;
import org.apache.sysds.runtime.matrix.operators.AggregateOperator;
import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.utils.DMLCompressionStatistics;

public class CLALibCompAgg {
    private static final Log LOG = LogFactory.getLog((String)CLALibCompAgg.class.getName());
    private static final long MIN_PAR_AGG_THRESHOLD = 8192L;

    /**
     * Computes a unary aggregate over a compressed matrix.
     *
     * <p>Operators rejected by {@code supported(op)}, and empty inputs, are delegated to the uncompressed
     * representation of the input. If the operator requires decompression for this input and a cached
     * decompressed block is available, the aggregate is delegated to that block. Otherwise the output
     * block is sized through {@code op.indexFn.computeDimension}, allocated dense, seeded by
     * {@code fillStart} and filled by either {@code aggOverlapping} or {@code agg}.
     *
     * @param inputMatrix the compressed input
     * @param result      optional output block to reuse; a new block is created when {@code null}
     * @param op          the aggregate operator
     * @param blen        block length, forwarded to the delegated implementations
     * @param indexesIn   matrix indexes, forwarded to the delegated implementations
     * @param inCP        when {@code false} and the operator has a correction, the result is padded with
     *                    correction cells (and the cell count for a mean)
     * @return the aggregated block
     */
    public static MatrixBlock aggregateUnary(CompressedMatrixBlock inputMatrix, MatrixBlock result, AggregateUnaryOperator op, int blen, MatrixIndexes indexesIn, boolean inCP) {
        if (!supported(op) || inputMatrix.isEmpty()) {
            final String reason = "Unary aggregate " + op + " not supported yet.";
            return inputMatrix.getUncompressed(reason, op.getNumThreads()).aggregateUnaryOperations(op, result, blen, indexesIn, inCP);
        }
        final int nRows = inputMatrix.getNumRows();
        final int nCols = inputMatrix.getNumColumns();
        final List<AColGroup> groups = inputMatrix.getColGroups();
        final boolean needsDecompression = requireDecompression(inputMatrix, op);
        if (needsDecompression) {
            LOG.trace("Require decompression in unaryAggregate");
            // Read the cache exactly once: the value that is null-checked must be the value that is used.
            // NOTE(review): presumably the cached block can be released between two reads - confirm.
            final MatrixBlock cached = inputMatrix.getCachedDecompressed();
            if (cached != null) {
                return cached.aggregateUnaryOperations(op, result, blen, indexesIn, inCP);
            }
        }

        // Let the index function decide the output dimensions.
        final MatrixValue.CellIndex outDims = new MatrixValue.CellIndex(-1, -1);
        op.indexFn.computeDimension(nRows, nCols, outDims);
        if (result != null) {
            result.reset(outDims.row, outDims.column, false);
        } else {
            result = new MatrixBlock(outDims.row, outDims.column, false);
        }
        result.allocateDenseBlock();

        final AggregateUnaryOperator opm = replaceKahnOperations(op);
        if (groups != null) {
            fillStart(inputMatrix, result, opm);
            if (needsDecompression) {
                aggOverlapping(inputMatrix, result, opm, indexesIn, inCP);
            } else {
                agg(inputMatrix, result, opm, blen, indexesIn, inCP);
            }
        }
        result.recomputeNonZeros();

        // The correction layout is decided by the original operator, not the Kahan-free replacement.
        if (op.aggOp.existsCorrection() && !inCP) {
            result = addCorrection(result, op);
            if (op.aggOp.increOp.fn instanceof Mean) {
                result = addCellCount(result, op, nRows, nCols);
            }
        }
        return result;
    }

    /**
     * Tells whether the aggregate function of {@code op} is handled by this library.
     *
     * <p>Accepted: builtin MIN and MAX, {@code KahanPlus}, {@code KahanPlusSq}, {@code Mean}, and
     * {@code Multiply} when combined with a {@code ReduceAll} index function.
     *
     * @param op the aggregate operator to inspect
     * @return {@code true} if the operator is handled here
     */
    private static boolean supported(AggregateUnaryOperator op) {
        final ValueFunction fn = op.aggOp.increOp.fn;
        if (fn instanceof Builtin) {
            final Builtin.BuiltinCode code = ((Builtin) fn).getBuiltinCode();
            return Builtin.BuiltinCode.MIN == code || Builtin.BuiltinCode.MAX == code;
        }
        if (fn instanceof KahanPlus || fn instanceof KahanPlusSq || fn instanceof Mean) {
            return true;
        }
        // The product is only accepted as a full reduction.
        return fn instanceof Multiply && op.indexFn instanceof ReduceAll;
    }

    /**
     * Tells whether the aggregate has to go through the decompressing code path.
     *
     * <p>Only overlapping inputs can require it, and then only for builtin MIN/MAX, {@code KahanPlusSq}
     * and {@code Multiply}.
     *
     * @param inputMatrix the compressed input
     * @param op          the aggregate operator
     * @return {@code true} if the decompressing path must be used
     */
    private static boolean requireDecompression(CompressedMatrixBlock inputMatrix, AggregateUnaryOperator op) {
        if (!inputMatrix.isOverlapping()) {
            return false;
        }
        final ValueFunction fn = op.aggOp.increOp.fn;
        if (!(fn instanceof Builtin)) {
            return fn instanceof KahanPlusSq || fn instanceof Multiply;
        }
        final Builtin.BuiltinCode code = ((Builtin) fn).getBuiltinCode();
        return Builtin.BuiltinCode.MIN == code || Builtin.BuiltinCode.MAX == code;
    }

    /**
     * Returns a block that has room for the correction cells demanded by {@code op.aggOp.correction}.
     *
     * <p>For the column-wise corrections the first column of {@code ret} is copied into a block that is
     * one or two columns wider; for the row-wise corrections the first row is copied into a block that is
     * one or two rows taller. The added cells are not written here. With no correction {@code ret} itself
     * is returned.
     *
     * @param ret the aggregate result without correction cells
     * @param op  the operator carrying the correction location
     * @return the padded block, or {@code ret} when no correction is requested
     * @throws NotImplementedException for any other correction location
     */
    private static MatrixBlock addCorrection(MatrixBlock ret, AggregateUnaryOperator op) {
        int extraRows = 0;
        int extraCols = 0;
        switch (op.aggOp.correction) {
            case NONE:
                return ret;
            case LASTCOLUMN:
                extraCols = 1;
                break;
            case LASTTWOCOLUMNS:
                extraCols = 2;
                break;
            case LASTROW:
                extraRows = 1;
                break;
            case LASTTWOROWS:
                extraRows = 2;
                break;
            default:
                throw new NotImplementedException("Not implemented corrections of more than 2");
        }

        final MatrixBlock padded = new MatrixBlock(ret.getNumRows() + extraRows, ret.getNumColumns() + extraCols, false);
        padded.allocateDenseBlock();
        if (extraCols > 0) {
            // Column-wise correction: carry over the first column, row by row.
            for (int row = 0; row < ret.getNumRows(); row++) {
                padded.setValue(row, 0, ret.quickGetValue(row, 0));
            }
        } else {
            // Row-wise correction: carry over the first row, column by column.
            for (int col = 0; col < ret.getNumColumns(); col++) {
                padded.setValue(0, col, ret.quickGetValue(0, col));
            }
        }
        return padded;
    }

    /**
     * Writes the number of aggregated cells next to each aggregate value.
     *
     * <p>For {@code ReduceAll} the count {@code nRow * nCol} goes to cell (0, 1); for {@code ReduceCol}
     * every row receives {@code nCol} in column 1; otherwise every column receives {@code nRow} in row 1.
     *
     * @param ret  the block to write into; it must already contain the extra column or row
     * @param op   the aggregate operator
     * @param nRow number of rows of the input
     * @param nCol number of columns of the input
     * @return {@code ret}
     */
    private static MatrixBlock addCellCount(MatrixBlock ret, AggregateUnaryOperator op, int nRow, int nCol) {
        final IndexFunction idxFn = op.indexFn;
        if (idxFn instanceof ReduceAll) {
            // Widen before multiplying so that large matrices do not overflow int.
            final long cells = (long) nRow * (long) nCol;
            ret.setValue(0, 1, cells);
            return ret;
        }
        if (idxFn instanceof ReduceCol) {
            for (int row = 0; row < nRow; row++) {
                ret.setValue(row, 1, nCol);
            }
            return ret;
        }
        for (int col = 0; col < nCol; col++) {
            ret.setValue(1, col, nRow);
        }
        return ret;
    }

    /**
     * Swaps a {@code KahanPlus} aggregate for a plain {@code Plus} aggregate with initial value 0, keeping
     * the index function and the thread count. Every other operator is returned unchanged.
     *
     * @param op the requested operator
     * @return the operator to execute
     */
    private static AggregateUnaryOperator replaceKahnOperations(AggregateUnaryOperator op) {
        final boolean isKahanSum = op.aggOp.increOp.fn instanceof KahanPlus;
        if (!isKahanSum) {
            return op;
        }
        final AggregateOperator plainPlus = new AggregateOperator(0.0, Plus.getPlusFnObject());
        return new AggregateUnaryOperator(plainPlus, op.indexFn, op.getNumThreads());
    }

    /**
     * Aggregates a compressed matrix directly on its column groups.
     *
     * <p>A {@code Mean} is executed as a {@code Plus} aggregate followed by a division by the number of
     * aggregated cells. The work is run in parallel when {@code isValidForParallelProcessing} allows it,
     * otherwise in the calling thread.
     *
     * @param m         the compressed input
     * @param o         the pre-allocated dense output
     * @param op        the aggregate operator
     * @param blen      unused here
     * @param indexesIn unused here
     * @param inCP      unused here
     */
    private static void agg(CompressedMatrixBlock m, MatrixBlock o, AggregateUnaryOperator op, int blen, MatrixIndexes indexesIn, boolean inCP) {
        final int k = op.getNumThreads();
        final boolean isMean = op.aggOp.increOp.fn instanceof Mean;

        // A mean is summed first and divided afterwards.
        AggregateUnaryOperator opm = op;
        if (isMean) {
            opm = new AggregateUnaryOperator(new AggregateOperator(0.0, Plus.getPlusFnObject()), op.indexFn);
        }

        if (!isValidForParallelProcessing(m, op)) {
            final int nRows = m.getNumRows();
            final int nCol = m.getNumColumns();
            final double[] out = o.getDenseBlockValues();
            final List<AColGroup> groups = m.getColGroups();
            // Only row aggregates make use of the per-group pre-aggregates.
            final double[][] preAgg = op.indexFn instanceof ReduceCol ? getPreAgg(opm, groups) : null;
            agg(opm, groups, out, nRows, 0, nRows, nCol, preAgg);
        } else {
            aggregateInParallel(m, o, opm, k);
        }

        if (isMean) {
            divideByNumberOfCellsForMean(m, o, op.indexFn);
        }
    }

    /**
     * Decides whether the aggregate is worth running in parallel: the operator must allow more than one
     * thread and the exact on-disk size of the matrix must exceed 8192.
     *
     * @param m1 the compressed input
     * @param op the aggregate operator
     * @return {@code true} if the parallel path should be taken
     */
    private static boolean isValidForParallelProcessing(CompressedMatrixBlock m1, AggregateUnaryOperator op) {
        if (op.getNumThreads() <= 1) {
            return false;
        }
        return m1.getExactSizeOnDisk() > 8192L;
    }

    /**
     * Runs the aggregate on a thread pool of size {@code k}.
     *
     * <p>Row aggregates ({@code ReduceCol}) are split into row ranges that all see every column group and
     * share one set of pre-aggregates. All other aggregates are split by column group through
     * {@code createTaskPartition}. The task results are merged by {@code reduceFutures}.
     *
     * <p>The pool is shut down in a {@code finally} block so that it is released on every exit path,
     * including unchecked exceptions thrown while building or reducing the tasks.
     *
     * @param m1  the compressed input
     * @param ret the pre-allocated dense output
     * @param op  the aggregate operator
     * @param k   the degree of parallelism
     * @throws DMLRuntimeException if the calling thread is interrupted or a task fails
     */
    private static void aggregateInParallel(CompressedMatrixBlock m1, MatrixBlock ret, AggregateUnaryOperator op, int k) {
        final ExecutorService pool = CommonThreadPool.get(k);
        try {
            final List<UnaryAggregateTask> tasks = new ArrayList<>();
            final int r = m1.getNumRows();
            final int c = m1.getNumColumns();
            final List<AColGroup> colGroups = m1.getColGroups();

            if (op.indexFn instanceof ReduceCol) {
                // Aim for two row blocks per thread, but never fewer than 65535 rows per block.
                final int minRowsPerTask = 65535;
                final int blklen = Math.max((int) Math.ceil((double) r / (double) (k * 2)), minRowsPerTask);
                final double[][] preAgg = getPreAgg(op, colGroups);
                for (int i = 0; i < r; i += blklen) {
                    tasks.add(new UnaryAggregateTask(colGroups, ret, r, i, Math.min(i + blklen, r), op, c, false, preAgg));
                }
            } else {
                for (List<AColGroup> grp : createTaskPartition(colGroups, k)) {
                    tasks.add(new UnaryAggregateTask(grp, ret, r, 0, r, op, c, m1.isOverlapping(), null));
                }
            }

            final List<Future<MatrixBlock>> futures = pool.invokeAll(tasks);
            reduceFutures(futures, ret, op, m1.isOverlapping());
        }
        catch (InterruptedException e) {
            // Keep the interrupt visible to callers further up the stack.
            Thread.currentThread().interrupt();
            throw new DMLRuntimeException("Aggregate In parallel failed.", e);
        }
        catch (ExecutionException e) {
            throw new DMLRuntimeException("Aggregate In parallel failed.", e);
        }
        finally {
            pool.shutdown();
        }
    }

    /**
     * Collects the row pre-aggregates of every compressed column group.
     *
     * @param opm    the aggregate operator handed to each group
     * @param groups the column groups
     * @return one entry per group, in list order; the entry stays {@code null} for groups that are not
     *         an {@code AColGroupCompressed}
     */
    private static double[][] getPreAgg(AggregateUnaryOperator opm, List<AColGroup> groups) {
        final double[][] preAggregates = new double[groups.size()][];
        int idx = 0;
        for (AColGroup group : groups) {
            if (group instanceof AColGroupCompressed) {
                preAggregates[idx] = ((AColGroupCompressed) group).preAggRows(opm);
            }
            idx++;
        }
        return preAggregates;
    }

    /**
     * Adds cell (0, 0) of every task result to cell (0, 0) of {@code ret}.
     *
     * @param ret     the output block, updated in place
     * @param futures the task results
     * @throws InterruptedException if interrupted while waiting for a task
     * @throws ExecutionException   if a task failed
     */
    private static void sumResults(MatrixBlock ret, List<Future<MatrixBlock>> futures) throws InterruptedException, ExecutionException {
        double total = ret.quickGetValue(0, 0);
        for (Future<MatrixBlock> partial : futures) {
            total += partial.get().quickGetValue(0, 0);
        }
        ret.quickSetValue(0, 0, total);
    }

    /**
     * Multiplies cell (0, 0) of {@code ret} with cell (0, 0) of every task result. As soon as one task
     * result is zero, the output is set to zero and the remaining results are not inspected.
     *
     * @param ret     the output block, updated in place
     * @param futures the task results
     * @throws InterruptedException if interrupted while waiting for a task
     * @throws ExecutionException   if a task failed
     */
    private static void productResults(MatrixBlock ret, List<Future<MatrixBlock>> futures) throws InterruptedException, ExecutionException {
        double product = ret.quickGetValue(0, 0);
        for (Future<MatrixBlock> partial : futures) {
            final double factor = partial.get().quickGetValue(0, 0);
            if (factor == 0.0) {
                // Short circuit: nothing can bring the product back from zero here.
                ret.quickSetValue(0, 0, 0.0);
                return;
            }
            product *= factor;
        }
        ret.quickSetValue(0, 0, product);
    }

    /**
     * Folds cell (0, 0) of every task result into cell (0, 0) of {@code ret} using the value function of
     * the operator.
     *
     * @param ret     the output block, updated in place
     * @param futures the task results
     * @param op      the operator whose {@code aggOp.increOp.fn} combines two values
     * @throws InterruptedException if interrupted while waiting for a task
     * @throws ExecutionException   if a task failed
     */
    private static void aggregateResults(MatrixBlock ret, List<Future<MatrixBlock>> futures, AggregateUnaryOperator op) throws InterruptedException, ExecutionException {
        final ValueFunction combine = op.aggOp.increOp.fn;
        double acc = ret.quickGetValue(0, 0);
        for (Future<MatrixBlock> partial : futures) {
            acc = combine.execute(acc, partial.get().quickGetValue(0, 0));
        }
        ret.quickSetValue(0, 0, acc);
    }

    /**
     * Turns previously computed sums into means by dispatching on the index function: full reduction,
     * row aggregate ({@code ReduceCol}) or, in every other case, column aggregate.
     *
     * @param m1    the compressed input, used for its dimensions
     * @param ret   the block holding the sums, updated in place
     * @param idxFn the index function of the aggregate
     */
    private static void divideByNumberOfCellsForMean(CompressedMatrixBlock m1, MatrixBlock ret, IndexFunction idxFn) {
        if (idxFn instanceof ReduceAll) {
            divideByNumberOfCellsForMeanAll(m1, ret);
            return;
        }
        if (idxFn instanceof ReduceCol) {
            divideByNumberOfCellsForMeanRows(m1, ret);
            return;
        }
        divideByNumberOfCellsForMeanCols(m1, ret);
    }

    /**
     * Divides the first {@code m1.getNumRows()} dense values of {@code ret} by the number of columns of
     * {@code m1}.
     *
     * @param m1  the compressed input, used for its dimensions
     * @param ret the block holding one sum per row, updated in place
     */
    private static void divideByNumberOfCellsForMeanRows(CompressedMatrixBlock m1, MatrixBlock ret) {
        final double[] rowSums = ret.getDenseBlockValues();
        final int nRows = m1.getNumRows();
        final double nCols = m1.getNumColumns();
        for (int row = 0; row < nRows; row++) {
            rowSums[row] /= nCols;
        }
    }

    /**
     * Divides the values of {@code ret} by the number of rows of {@code m1}.
     *
     * <p>For a sparse {@code ret} the value array of sparse row 0 is scaled (nothing is done when that
     * row is empty); for a dense {@code ret} the whole dense value array is scaled.
     *
     * @param m1  the compressed input, used for its dimensions
     * @param ret the block holding one sum per column, updated in place
     */
    private static void divideByNumberOfCellsForMeanCols(CompressedMatrixBlock m1, MatrixBlock ret) {
        final double nRows = m1.getNumRows();
        final double[] colSums;
        if (ret.isInSparseFormat()) {
            final SparseBlock sparse = ret.getSparseBlock();
            if (sparse.isEmpty(0)) {
                return;
            }
            colSums = sparse.values(0);
        } else {
            colSums = ret.getDenseBlockValues();
        }
        for (int col = 0; col < colSums.length; col++) {
            colSums[col] /= nRows;
        }
    }

    /**
     * Divides cell (0, 0) of {@code ret} by the total number of cells of {@code m1}.
     *
     * @param m1  the compressed input, used for its dimensions
     * @param ret the block holding the overall sum, updated in place
     */
    private static void divideByNumberOfCellsForMeanAll(CompressedMatrixBlock m1, MatrixBlock ret) {
        // Multiply as long so that the cell count cannot overflow int.
        final long cells = (long) m1.getNumColumns() * (long) m1.getNumRows();
        final double sum = ret.quickGetValue(0, 0);
        ret.quickSetValue(0, 0, sum / (double) cells);
    }

    /**
     * Aggregates through the decompressing code path: builds and runs the overlapping tasks, then merges
     * their results with {@code reduceFutures} in overlapping mode.
     *
     * @param m1        the compressed input
     * @param ret       the pre-allocated dense output
     * @param op        the aggregate operator
     * @param indexesIn unused here
     * @param inCP      unused here
     * @throws DMLCompressionException if the calling thread is interrupted or a task fails
     */
    private static void aggOverlapping(CompressedMatrixBlock m1, MatrixBlock ret, AggregateUnaryOperator op, MatrixIndexes indexesIn, boolean inCP) {
        try {
            final List<Future<MatrixBlock>> rtasks = generateUnaryAggregateOverlappingFutures(m1, ret, op);
            reduceFutures(rtasks, ret, op, true);
        }
        catch (InterruptedException e) {
            // Wrapping alone would hide the interruption from the callers; restore the flag first.
            Thread.currentThread().interrupt();
            throw new DMLCompressionException("Error in Compressed Unary Aggregate", e);
        }
        catch (ExecutionException e) {
            throw new DMLCompressionException("Error in Compressed Unary Aggregate", e);
        }
    }

    /**
     * Waits for all tasks and merges their results into {@code ret}.
     *
     * <p>The tasks in this class hand back a private partial result only for a full reduction
     * ({@code ReduceAll}) or for an overlapping column aggregate ({@code ReduceRow}). In every other case
     * they write straight into {@code ret}: {@code UnaryAggregateTask} then returns {@code ret} itself and
     * {@code UAOverlappingTask} returns {@code null}. Such in-place results must only be awaited, never
     * combined, even when {@code ret} happens to be 1x1. Combining them would fold {@code ret} into
     * itself (doubling a sum) or dereference a {@code null} result.
     *
     * @param futures     the task results
     * @param ret         the output block
     * @param op          the aggregate operator
     * @param overlapping whether the aggregated column groups overlap
     * @throws InterruptedException if interrupted while waiting for a task
     * @throws ExecutionException   if a task failed
     */
    private static void reduceFutures(List<Future<MatrixBlock>> futures, MatrixBlock ret, AggregateUnaryOperator op, boolean overlapping) throws InterruptedException, ExecutionException {
        final IndexFunction idxFn = op.indexFn;
        final boolean overlappingColAgg = idxFn instanceof ReduceRow && overlapping;
        final boolean tasksReturnPartials = idxFn instanceof ReduceAll || overlappingColAgg;

        if (tasksReturnPartials && isReduceAll(ret, idxFn)) {
            // Scalar output backed by private partial results: combine them cell by cell.
            reduceAllFutures(futures, ret, op);
        } else if (overlappingColAgg) {
            final ValueFunction fn = op.aggOp.increOp.fn;
            final boolean isPlus = fn instanceof KahanFunction || fn instanceof Mean;
            final BinaryOperator bop = isPlus ? new BinaryOperator(Plus.getPlusFnObject()) : op.aggOp.increOp;
            for (Future<MatrixBlock> rtask : futures) {
                LibMatrixBincell.bincellOpInPlace(ret, rtask.get(), bop);
            }
        } else {
            // In-place tasks: only wait for completion and surface task failures.
            for (Future<MatrixBlock> rtask : futures) {
                rtask.get();
            }
        }
    }

    /**
     * Tells whether the aggregate produces a single cell: either the index function is
     * {@code ReduceAll}, or the output block is 1x1.
     *
     * @param ret   the output block
     * @param idxFn the index function of the aggregate
     * @return {@code true} if the output is a single cell
     */
    private static boolean isReduceAll(MatrixBlock ret, IndexFunction idxFn) {
        if (idxFn instanceof ReduceAll) {
            return true;
        }
        final boolean singleCell = ret.getNumColumns() == 1 && ret.getNumRows() == 1;
        return singleCell;
    }

    /**
     * Merges single-cell task results into {@code ret}: builtin functions are folded with the operator's
     * value function, {@code Multiply} is multiplied, everything else is summed.
     *
     * @param futures the task results
     * @param ret     the output block, updated in place
     * @param op      the aggregate operator
     * @throws InterruptedException if interrupted while waiting for a task
     * @throws ExecutionException   if a task failed
     */
    private static void reduceAllFutures(List<Future<MatrixBlock>> futures, MatrixBlock ret, AggregateUnaryOperator op) throws InterruptedException, ExecutionException {
        final ValueFunction fn = op.aggOp.increOp.fn;
        if (fn instanceof Builtin) {
            aggregateResults(ret, futures, op);
            return;
        }
        if (fn instanceof Multiply) {
            productResults(ret, futures);
            return;
        }
        sumResults(ret, futures);
    }

    /**
     * Builds one {@code UAOverlappingTask} per row range, runs them all and returns their futures.
     *
     * <p>Row ranges are {@code max(512, nRow / k)} rows long. When {@code CLALibUtils.shouldPreFilter}
     * asks for it, the groups are first passed through {@code CLALibUtils.filterGroups}, which fills a
     * vector of per-column constants; that vector is appended to the filtered groups as one constant
     * column group.
     *
     * <p>The pool is shut down in a {@code finally} block so that it is also released when task
     * construction or {@code invokeAll} throws.
     *
     * @param m1  the compressed input
     * @param ret the pre-allocated dense output shared by all tasks
     * @param op  the aggregate operator
     * @return the futures of the tasks
     * @throws InterruptedException if interrupted while waiting for the tasks
     */
    private static List<Future<MatrixBlock>> generateUnaryAggregateOverlappingFutures(CompressedMatrixBlock m1, MatrixBlock ret, AggregateUnaryOperator op) throws InterruptedException {
        final int k = op.getNumThreads();
        final ExecutorService pool = CommonThreadPool.get(k);
        try {
            final int nCol = m1.getNumColumns();
            final int nRow = m1.getNumRows();
            final int blklen = Math.max(512, nRow / k);

            List<AColGroup> groups = m1.getColGroups();
            if (CLALibUtils.shouldPreFilter(groups)) {
                final double[] constV = new double[nCol];
                final List<AColGroup> filteredGroups = CLALibUtils.filterGroups(groups, constV);
                filteredGroups.add(ColGroupConst.create(constV));
                groups = filteredGroups;
            }

            final List<UAOverlappingTask> tasks = new ArrayList<>();
            for (int i = 0; i < nRow; i += blklen) {
                tasks.add(new UAOverlappingTask(groups, ret, i, Math.min(i + blklen, nRow), op, nCol));
            }
            return pool.invokeAll(tasks);
        }
        finally {
            pool.shutdown();
        }
    }

    /**
     * Distributes the column groups round-robin over {@code min(k, colGroups.size())} partitions.
     *
     * @param colGroups the column groups to distribute
     * @param k         the maximum number of partitions
     * @return the partitions; empty when {@code colGroups} is empty
     */
    private static List<List<AColGroup>> createTaskPartition(List<AColGroup> colGroups, int k) {
        final int numTasks = Math.min(k, colGroups.size());
        final List<List<AColGroup>> grpParts = new ArrayList<>();
        for (int i = 0; i < numTasks; ++i) {
            grpParts.add(new ArrayList<AColGroup>());
        }
        int pos = 0;
        for (AColGroup grp : colGroups) {
            grpParts.get(pos).add(grp);
            pos = (pos + 1) % numTasks;
        }
        return grpParts;
    }

    /**
     * Aggregates the given column groups over the row range {@code [rl, ru)} into {@code ret}. Row
     * aggregates ({@code ReduceCol}) go through {@code aggRow}, everything else through
     * {@code aggColOrAll}.
     *
     * @param op         the aggregate operator
     * @param groups     the column groups to aggregate
     * @param ret        the dense output values
     * @param nRows      number of rows of the matrix
     * @param rl         first row, inclusive
     * @param ru         last row, exclusive
     * @param numColumns number of columns of the matrix
     * @param preAgg     per-group pre-aggregates; only read for row aggregates
     */
    private static void agg(AggregateUnaryOperator op, List<AColGroup> groups, double[] ret, int nRows, int rl, int ru, int numColumns, double[][] preAgg) {
        final boolean rowAggregate = op.indexFn instanceof ReduceCol;
        if (!rowAggregate) {
            aggColOrAll(op, groups, ret, nRows, rl, ru, numColumns);
            return;
        }
        aggRow(op, groups, ret, nRows, rl, ru, numColumns, preAgg);
    }

    /**
     * Lets every column group aggregate itself into {@code ret}, in list order.
     *
     * @param op         the aggregate operator
     * @param groups     the column groups to aggregate
     * @param ret        the dense output values
     * @param nRows      number of rows of the matrix
     * @param rl         first row, inclusive
     * @param ru         last row, exclusive
     * @param numColumns unused here
     */
    private static void aggColOrAll(AggregateUnaryOperator op, List<AColGroup> groups, double[] ret, int nRows, int rl, int ru, int numColumns) {
        final int nGroups = groups.size();
        for (int g = 0; g < nGroups; g++) {
            groups.get(g).unaryAggregateOperations(op, ret, nRows, rl, ru);
        }
    }

    /**
     * Row aggregate over all column groups. Compressed groups receive their own pre-aggregate from
     * {@code preAgg}; all other groups use the plain aggregate call.
     *
     * @param op         the aggregate operator
     * @param groups     the column groups to aggregate
     * @param ret        the dense output values
     * @param nRows      number of rows of the matrix
     * @param rl         first row, inclusive
     * @param ru         last row, exclusive
     * @param numColumns unused here
     * @param preAgg     per-group pre-aggregates, indexed like {@code groups}
     */
    private static void aggRow(AggregateUnaryOperator op, List<AColGroup> groups, double[] ret, int nRows, int rl, int ru, int numColumns, double[][] preAgg) {
        int idx = 0;
        for (AColGroup group : groups) {
            if (group instanceof AColGroupCompressed) {
                ((AColGroupCompressed) group).unaryAggregateOperations(op, ret, nRows, rl, ru, preAgg[idx]);
            } else {
                group.unaryAggregateOperations(op, ret, nRows, rl, ru);
            }
            idx++;
        }
    }

    /**
     * Seeds the output block with the start value of the aggregate.
     *
     * <p>Builtin MAX fills the dense block with negative infinity, builtin MIN with positive infinity.
     * For {@code Multiply} with {@code ReduceAll}, cell (0, 0) is set to 0 when the reported number of
     * non-zeros differs from the number of cells, and to 1 otherwise. All other functions leave the
     * block untouched.
     *
     * @param in  the input matrix
     * @param ret the allocated dense output block
     * @param op  the aggregate operator
     * @throws NotImplementedException for {@code Multiply} with any index function but {@code ReduceAll}
     */
    private static void fillStart(MatrixBlock in, MatrixBlock ret, AggregateUnaryOperator op) {
        final ValueFunction fn = op.aggOp.increOp.fn;
        if (fn instanceof Builtin) {
            switch (((Builtin) fn).getBuiltinCode()) {
                case MAX:
                    ret.getDenseBlock().set(Double.NEGATIVE_INFINITY);
                    break;
                case MIN:
                    ret.getDenseBlock().set(Double.POSITIVE_INFINITY);
                    break;
                default:
                    break;
            }
        }
        if (fn instanceof Multiply) {
            final long nnz = in.getNonZeros();
            final long cells = (long) in.getNumRows() * (long) in.getNumColumns();
            if (!(op.indexFn instanceof ReduceAll)) {
                throw new NotImplementedException();
            }
            // Any zero cell forces the product to zero.
            ret.setValue(0, 0, nnz != cells ? 0.0 : 1.0);
        }
    }

    /**
     * Creates a private dense 1 x c buffer for a task, where c is the number of columns of {@code ret}.
     * For builtin functions and {@code Multiply} the first c dense values of {@code ret} are copied into
     * the buffer; otherwise the buffer is left as allocated.
     *
     * @param ret the shared output block
     * @param op  the aggregate operator
     * @return the new buffer
     */
    protected static MatrixBlock genTmpReduceAllOrRow(MatrixBlock ret, AggregateUnaryOperator op) {
        final int nCols = ret.getNumColumns();
        final MatrixBlock buffer = new MatrixBlock(1, nCols, false);
        buffer.allocateDenseBlock();
        final ValueFunction fn = op.aggOp.increOp.fn;
        final boolean carryStartValues = fn instanceof Builtin || fn instanceof Multiply;
        if (carryStartValues) {
            System.arraycopy(ret.getDenseBlockValues(), 0, buffer.getDenseBlockValues(), 0, nCols);
        }
        return buffer;
    }

    /**
     * Task that aggregates the row range {@code [_rl, _ru)} by decompressing the column groups into a
     * temporary dense block and running the uncompressed aggregate on it.
     *
     * <p>Row aggregates ({@code ReduceCol}) are processed in chunks of {@code _blklen} rows, written
     * straight into the shared output, and the task returns {@code null}. Every other aggregate
     * decompresses the whole range at once and returns its own result block.
     */
    private static class UAOverlappingTask
    implements Callable<MatrixBlock> {
        private final List<AColGroup> _groups;
        private final int _rl;
        private final int _ru;
        private final int _blklen;
        private final MatrixBlock _ret;
        private final AggregateUnaryOperator _op;
        private final int _nCol;

        /**
         * @param filteredGroups the column groups to decompress
         * @param ret            the shared output block
         * @param rl             first row, inclusive
         * @param ru             last row, exclusive
         * @param op             the aggregate operator
         * @param nCol           number of columns of the input matrix
         */
        protected UAOverlappingTask(List<AColGroup> filteredGroups, MatrixBlock ret, int rl, int ru, AggregateUnaryOperator op, int nCol) {
            _groups = filteredGroups;
            _ret = ret;
            _rl = rl;
            _ru = ru;
            _op = op;
            _nCol = nCol;
            // Chunk size shrinks with the output width and the number of groups, but never below 64 rows.
            final int rowsPerChunk = 131072 / ret.getNumColumns() / filteredGroups.size();
            _blklen = Math.max(rowsPerChunk, 64);
        }

        /** Allocates the dense scratch block, sized for the smaller of the row range and one chunk. */
        private MatrixBlock getTmp() {
            final int rows = Math.min(_ru - _rl, _blklen);
            final MatrixBlock scratch = new MatrixBlock(rows, _nCol, false);
            scratch.allocateDenseBlock();
            return scratch;
        }

        /**
         * Decompresses rows {@code [rl, ru)} of every group into {@code tmp}, shifted so that row
         * {@code rl} lands on row 0. {@code ASDCZero} groups use the iterator stored at their index in
         * {@code its}.
         */
        private MatrixBlock decompressToTemp(MatrixBlock tmp, int rl, int ru, AIterator[] its) {
            final Timing timer = new Timing(true);
            final DenseBlock target = tmp.getDenseBlock();
            int idx = 0;
            for (AColGroup group : _groups) {
                if (group instanceof ASDCZero) {
                    ((ASDCZero) group).decompressToDenseBlock(target, rl, ru, -rl, 0, its[idx]);
                } else {
                    group.decompressToDenseBlock(target, rl, ru, -rl, 0);
                }
                idx++;
            }
            // NOTE(review): rl + ru is not an actual non-zero count; presumably it only marks the block
            // as non-empty for the following aggregate - confirm.
            tmp.setNonZeros(rl + ru);
            if (DMLScript.STATISTICS) {
                final double elapsed = timer.stop();
                DMLCompressionStatistics.addDecompressToBlockTime(elapsed, 1);
                if (LOG.isTraceEnabled()) {
                    LOG.trace("decompressed block w/ k=1 in " + elapsed + "ms.");
                }
            }
            return tmp;
        }

        /** Tells whether the value function is builtin MIN or MAX. */
        private static boolean isMinOrMax(ValueFunction fn) {
            if (!(fn instanceof Builtin)) {
                return false;
            }
            final Builtin.BuiltinCode code = ((Builtin) fn).getBuiltinCode();
            return Builtin.BuiltinCode.MIN == code || Builtin.BuiltinCode.MAX == code;
        }

        /** Row aggregate: decompress and aggregate chunk by chunk, writing into the shared output. */
        private void aggregateRowsInChunks(MatrixBlock tmp, AIterator[] its, boolean minOrMax) {
            final int retCols = _ret.getNumColumns();
            for (int start = _rl; start < _ru; start += _blklen) {
                final int end = Math.min(start + _blklen, _ru);
                final int rows = end - start;
                tmp.reset(rows, tmp.getNumColumns(), false);
                decompressToTemp(tmp, start, end, its);

                final MatrixBlock partial = tmp.prepareAggregateUnaryOutput(_op, null, 1000);
                LibMatrixAgg.aggregateUnaryMatrix(tmp, partial, _op);
                partial.dropLastRowsOrColumns(_op.aggOp.correction);

                final int offset = start * retCols;
                if (partial.isEmpty()) {
                    // An empty partial means all zeros; only MIN/MAX outputs need the zeros written.
                    if (minOrMax) {
                        Arrays.fill(_ret.getDenseBlockValues(), offset, end * retCols, 0.0);
                    }
                    continue;
                }
                if (partial.isInSparseFormat()) {
                    throw new NotImplementedException("Not supported Sparse yet and it should be extremely unlikely/not happen. because we work with a single column here");
                }
                System.arraycopy(partial.getDenseBlockValues(), 0, _ret.getDenseBlockValues(), offset, rows);
            }
        }

        /** Full or column aggregate: decompress the whole range once and return its own result. */
        private MatrixBlock aggregateWholeRange(MatrixBlock tmp, AIterator[] its) {
            decompressToTemp(tmp, _rl, _ru, its);
            final MatrixBlock out = tmp.prepareAggregateUnaryOutput(_op, null, 1000);
            LibMatrixAgg.aggregateUnaryMatrix(tmp, out, _op);
            out.dropLastRowsOrColumns(_op.aggOp.correction);
            return out;
        }

        @Override
        public MatrixBlock call() {
            final MatrixBlock tmp = getTmp();
            final boolean minOrMax = isMinOrMax(_op.aggOp.increOp.fn);

            // One iterator per ASDCZero group, positioned at the first row of this task.
            final AIterator[] its = new AIterator[_groups.size()];
            for (int i = 0; i < its.length; i++) {
                final AColGroup group = _groups.get(i);
                if (group instanceof ASDCZero) {
                    its[i] = ((ASDCZero) group).getIterator(_rl);
                }
            }

            if (_op.indexFn instanceof ReduceCol) {
                aggregateRowsInChunks(tmp, its, minOrMax);
                return null;
            }
            return aggregateWholeRange(tmp, its);
        }
    }

    /**
     * Task that aggregates a set of column groups over the row range {@code [_rl, _ru)} without
     * decompressing them.
     *
     * <p>For a full reduction ({@code ReduceAll}) and for an overlapping column aggregate
     * ({@code ReduceRow}) the task works on a private buffer and returns it. In every other case it
     * writes into the shared output block and returns that block.
     */
    private static class UnaryAggregateTask
    implements Callable<MatrixBlock> {
        private final List<AColGroup> _groups;
        private final int _nRows;
        private final int _rl;
        private final int _ru;
        private final MatrixBlock _ret;
        private final int _numColumns;
        private final AggregateUnaryOperator _op;
        private final boolean _overlapping;
        private final double[][] _preAgg;

        /**
         * @param groups      the column groups handled by this task
         * @param ret         the shared output block
         * @param nRows       number of rows of the matrix
         * @param rl          first row, inclusive
         * @param ru          last row, exclusive
         * @param op          the aggregate operator
         * @param numColumns  number of columns of the matrix
         * @param overlapping whether the column groups overlap
         * @param preAgg      per-group pre-aggregates, or {@code null}
         */
        protected UnaryAggregateTask(List<AColGroup> groups, MatrixBlock ret, int nRows, int rl, int ru, AggregateUnaryOperator op, int numColumns, boolean overlapping, double[][] preAgg) {
            _groups = groups;
            _ret = ret;
            _nRows = nRows;
            _rl = rl;
            _ru = ru;
            _op = op;
            _numColumns = numColumns;
            _overlapping = overlapping;
            _preAgg = preAgg;
        }

        @Override
        public MatrixBlock call() {
            final boolean overlappingColAgg = _op.indexFn instanceof ReduceRow && _overlapping;
            final boolean needsPrivateBuffer = _op.indexFn instanceof ReduceAll || overlappingColAgg;
            final MatrixBlock target = needsPrivateBuffer ? CLALibCompAgg.genTmpReduceAllOrRow(_ret, _op) : _ret;

            CLALibCompAgg.agg(_op, _groups, target.getDenseBlockValues(), _nRows, _rl, _ru, _numColumns, _preAgg);

            if (overlappingColAgg) {
                target.recomputeNonZeros();
            }
            return target;
        }
    }
}

