Index: Evaluator/InlineEvaluator.h =================================================================== RCS file: /home/pooma/Repository/r2/src/Evaluator/InlineEvaluator.h,v retrieving revision 1.28 diff -u -u -r1.28 InlineEvaluator.h --- Evaluator/InlineEvaluator.h 22 Oct 2003 20:43:26 -0000 1.28 +++ Evaluator/InlineEvaluator.h 27 Nov 2003 20:57:35 -0000 @@ -149,6 +149,7 @@ LHS localLHS(lhs); RHS localRHS(rhs); int e0 = domain[0].length(); +#pragma omp parallel for if (e0 > 512) for (int i0=0; i0 512) for (int i0 = f0; i0 <= e0; ++i0) op(i0); } @@ -116,6 +117,7 @@ int f1 = domain[1].first(); int e0 = domain[0].last(); int e1 = domain[1].last(); +#pragma omp parallel for for (int i1 = f1; i1 <= e1; ++i1) for (int i0 = f0; i0 <= e0; ++i0) op(i0, i1); @@ -131,6 +133,7 @@ int e0 = domain[0].last(); int e1 = domain[1].last(); int e2 = domain[2].last(); +#pragma omp parallel for for (int i2 = f2; i2 <= e2; ++i2) for (int i1 = f1; i1 <= e1; ++i1) for (int i0 = f0; i0 <= e0; ++i0) @@ -149,6 +152,7 @@ int e1 = domain[1].last(); int e2 = domain[2].last(); int e3 = domain[3].last(); +#pragma omp parallel for for (int i3 = f3; i3 <= e3; ++i3) for (int i2 = f2; i2 <= e2; ++i2) for (int i1 = f1; i1 <= e1; ++i1) @@ -170,6 +174,7 @@ int e2 = domain[2].last(); int e3 = domain[3].last(); int e4 = domain[4].last(); +#pragma omp parallel for for (int i4 = f4; i4 <= e4; ++i4) for (int i3 = f3; i3 <= e3; ++i3) for (int i2 = f2; i2 <= e2; ++i2) @@ -194,6 +199,7 @@ int e3 = domain[3].last(); int e4 = domain[4].last(); int e5 = domain[5].last(); +#pragma omp parallel for for (int i5 = f5; i5 <= e5; ++i5) for (int i4 = f4; i4 <= e4; ++i4) for (int i3 = f3; i3 <= e3; ++i3) @@ -221,6 +227,7 @@ int e4 = domain[4].last(); int e5 = domain[5].last(); int e6 = domain[6].last(); +#pragma omp parallel for for (int i6 = f6; i6 <= e6; ++i6) for (int i5 = f5; i5 <= e5; ++i5) for (int i4 = f4; i4 <= e4; ++i4) Index: Evaluator/ReductionEvaluator.h =================================================================== RCS file: /home/pooma/Repository/r2/src/Evaluator/ReductionEvaluator.h,v retrieving revision 1.9 diff -u -u -r1.9 ReductionEvaluator.h --- Evaluator/ReductionEvaluator.h 29 Oct 2003 20:13:27 -0000 1.9 +++ Evaluator/ReductionEvaluator.h 27 Nov 2003 20:57:36 -0000 @@ -108,6 +108,56 @@ }; +/** + * Class to hold static array for partial reduction results + * and routine for final reduction. Two versions, one dummy + * for non-OpenMP, one for OpenMP operation. + */ + +#ifndef _OPENMP +template +struct PartialReduction { + static inline void init() {} + inline void storePartialResult(const T& result) + { + answer = result; + } + template + inline void reduce(T& ret, const Op&) + { + ret = answer; + } + T answer; +}; +#else +template +struct PartialReduction { + static inline void init() + { + if (!answer) + answer = new T[omp_get_max_threads()]; + } + inline void storePartialResult(const T& result) + { + int n = omp_get_thread_num(); + answer[n] = result; + if (n == 0) + num_threads = omp_get_num_threads(); + } + template + inline void reduce(T& ret, const Op& op) + { + T res = answer[0]; + for (int i = 1; i +T *PartialReduction::answer = NULL; +#endif //----------------------------------------------------------------------------- @@ -130,6 +180,7 @@ template<> struct ReductionEvaluator { + //--------------------------------------------------------------------------- // Input an expression and cause it to be evaluated. // All this template function does is extract the domain @@ -139,6 +190,7 @@ inline static void evaluate(T &ret, const Op &op, const Expr &e) { typedef typename Expr::Domain_t Domain_t; + PartialReduction::init(); evaluate(ret, op, e, e.domain(), WrappedInt()); } @@ -171,7 +223,7 @@ // // NOTE: These loops assume that the domain passed in is a unit-stride // domain starting at 0. Assertions are made to make sure this is true. - + template inline static void evaluate(T &ret, const Op &op, const Expr &e, const Domain &domain, WrappedInt<1>) @@ -181,9 +233,16 @@ Expr localExpr(e); int e0 = domain[0].length(); - T answer = ReductionTraits::identity(); - for (int i0 = 0; i0 < e0; ++i0) - op(answer, localExpr.read(i0)); + PartialReduction reduction; +#pragma omp parallel if (e0 > 512) + { + T answer = ReductionTraits::identity(); +#pragma omp for nowait + for (int i0 = 0; i0 < e0; ++i0) + op(answer, localExpr.read(i0)); + reduction.storePartialResult(answer); + } + reduction.reduce(ret, op); ret = answer; } @@ -199,12 +258,17 @@ int e0 = domain[0].length(); int e1 = domain[1].length(); - T answer = ReductionTraits::identity(); - for (int i1 = 0; i1 < e1; ++i1) - for (int i0 = 0; i0 < e0; ++i0) - op(answer, localExpr.read(i0, i1)); - - ret = answer; + PartialReduction reduction; +#pragma omp parallel + { + T answer = ReductionTraits::identity(); +#pragma omp for nowait + for (int i1 = 0; i1 < e1; ++i1) + for (int i0 = 0; i0 < e0; ++i0) + op(answer, localExpr.read(i0, i1)); + reduction.storePartialResult(answer); + } + reduction.reduce(ret, op); } template @@ -220,13 +284,18 @@ int e1 = domain[1].length(); int e2 = domain[2].length(); - T answer = ReductionTraits::identity(); - for (int i2 = 0; i2 < e2; ++i2) - for (int i1 = 0; i1 < e1; ++i1) - for (int i0 = 0; i0 < e0; ++i0) - op(answer, localExpr.read(i0, i1, i2)); - - ret = answer; + PartialReduction reduction; +#pragma omp parallel + { + T answer = ReductionTraits::identity(); +#pragma omp for nowait + for (int i2 = 0; i2 < e2; ++i2) + for (int i1 = 0; i1 < e1; ++i1) + for (int i0 = 0; i0 < e0; ++i0) + op(answer, localExpr.read(i0, i1, i2)); + reduction.storePartialResult(answer); + } + reduction.reduce(ret, op); } template @@ -244,14 +313,19 @@ int e2 = domain[2].length(); int e3 = domain[3].length(); - T answer = ReductionTraits::identity(); - for (int i3 = 0; i3 < e3; ++i3) - for (int i2 = 0; i2 < e2; ++i2) - for (int i1 = 0; i1 < e1; ++i1) - for (int i0 = 0; i0 < e0; ++i0) - op(answer, localExpr.read(i0, i1, i2, i3)); - - ret = answer; + PartialReduction reduction; +#pragma omp parallel + { + T answer = ReductionTraits::identity(); +#pragma omp for nowait + for (int i3 = 0; i3 < e3; ++i3) + for (int i2 = 0; i2 < e2; ++i2) + for (int i1 = 0; i1 < e1; ++i1) + for (int i0 = 0; i0 < e0; ++i0) + op(answer, localExpr.read(i0, i1, i2, i3)); + reduction.storePartialResult(answer); + } + reduction.reduce(ret, op); } template @@ -271,15 +345,20 @@ int e3 = domain[3].length(); int e4 = domain[4].length(); - T answer = ReductionTraits::identity(); - for (int i4 = 0; i4 < e4; ++i4) - for (int i3 = 0; i3 < e3; ++i3) - for (int i2 = 0; i2 < e2; ++i2) - for (int i1 = 0; i1 < e1; ++i1) - for (int i0 = 0; i0 < e0; ++i0) - op(answer, localExpr.read(i0, i1, i2, i3, i4)); - - ret = answer; + PartialReduction reduction; +#pragma omp parallel + { + T answer = ReductionTraits::identity(); +#pragma omp for nowait + for (int i4 = 0; i4 < e4; ++i4) + for (int i3 = 0; i3 < e3; ++i3) + for (int i2 = 0; i2 < e2; ++i2) + for (int i1 = 0; i1 < e1; ++i1) + for (int i0 = 0; i0 < e0; ++i0) + op(answer, localExpr.read(i0, i1, i2, i3, i4)); + reduction.storePartialResult(answer); + } + reduction.reduce(ret, op); } template @@ -301,16 +380,21 @@ int e4 = domain[4].length(); int e5 = domain[5].length(); - T answer = ReductionTraits::identity(); - for (int i5 = 0; i5 < e5; ++i5) - for (int i4 = 0; i4 < e4; ++i4) - for (int i3 = 0; i3 < e3; ++i3) - for (int i2 = 0; i2 < e2; ++i2) - for (int i1 = 0; i1 < e1; ++i1) - for (int i0 = 0; i0 < e0; ++i0) - op(answer, localExpr.read(i0, i1, i2, i3, i4, i5)); - - ret = answer; + PartialReduction reduction; +#pragma omp parallel + { + T answer = ReductionTraits::identity(); +#pragma omp for nowait + for (int i5 = 0; i5 < e5; ++i5) + for (int i4 = 0; i4 < e4; ++i4) + for (int i3 = 0; i3 < e3; ++i3) + for (int i2 = 0; i2 < e2; ++i2) + for (int i1 = 0; i1 < e1; ++i1) + for (int i0 = 0; i0 < e0; ++i0) + op(answer, localExpr.read(i0, i1, i2, i3, i4, i5)); + reduction.storePartialResult(answer); + } + reduction.reduce(ret, op); } template @@ -334,17 +418,22 @@ int e5 = domain[5].length(); int e6 = domain[6].length(); - T answer = ReductionTraits::identity(); - for (int i6 = 0; i6 < e6; ++i6) - for (int i5 = 0; i5 < e5; ++i5) - for (int i4 = 0; i4 < e4; ++i4) - for (int i3 = 0; i3 < e3; ++i3) - for (int i2 = 0; i2 < e2; ++i2) - for (int i1 = 0; i1 < e1; ++i1) - for (int i0 = 0; i0 < e0; ++i0) - op(answer, localExpr.read(i0, i1, i2, i3, i4, i5, i6)); - - ret = answer; + PartialReduction reduction; +#pragma omp parallel + { + T answer = ReductionTraits::identity(); +#pragma omp for nowait + for (int i6 = 0; i6 < e6; ++i6) + for (int i5 = 0; i5 < e5; ++i5) + for (int i4 = 0; i4 < e4; ++i4) + for (int i3 = 0; i3 < e3; ++i3) + for (int i2 = 0; i2 < e2; ++i2) + for (int i1 = 0; i1 < e1; ++i1) + for (int i0 = 0; i0 < e0; ++i0) + op(answer, localExpr.read(i0, i1, i2, i3, i4, i5, i6)); + reduction.storePartialResult(answer); + } + reduction.reduce(ret, op); } };