/*
**
** PHiPAC Matrix-Matrix Code for the operation:
**    C = transpose(A)*B + C
**
** Automatically Generated by mm_cgen ($Revision: 1.27 $) using the command:
**    ./mm_cgen -prec double -opA T -opB N -alpha 1 -sp 1 -holdstripe B -l0 9 4 1 -file ./src/mm_double_TN_1_general.c -routine_name mm_double_TN_1_general 
**
** Run './mm_cgen -help' for help.
**
** Generated on: Wednesday July 10 2013, 08:33:28 PDT
** Created by: Jeff Bilmes <bilmes@cs.berkeley.edu>
**             http://www.icsi.berkeley.edu/~bilmes/phipac
**
**
** Routine Usage: General (M,K,N) = (M, K, N) matrix multiply
**    mm_double_TN_1_general(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
** where
**  transpose(A) is an MxK matrix
**  B is a KxN matrix
**  C is an MxN matrix
**  Astride is the number of entries between the start of each row of A
**  Bstride is the number of entries between the start of each row of B
**  Cstride is the number of entries between the start of each row of C
**
**
** "Copyright (c) 1995 The Regents of the University of California.  All
** rights reserved."  Permission to use, copy, modify, and distribute
** this software and its documentation for any purpose, without fee, and
** without written agreement is hereby granted, provided that the above
** copyright notice and the following two paragraphs appear in all copies
** of this software.
**
** IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
** DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
** OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF
** CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**
** THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
** INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
** AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
** ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO
** PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
**
*/

/*
 * Update one horizontal strip of C:
 *
 *    C[i][j] += sum over k of A[k][i] * B[k][j]
 *
 * for i in [0, rows), j in [0, N) and k in [0, K), where
 *    A[k][i] is a[k*Astride + i],
 *    B[k][j] is B[k*Bstride + j],
 *    C[i][j] is c[i*Cstride + j].
 *
 * rows must be in 1..9 (the size of the accumulator block below) and
 * K and N must be positive; the only caller guarantees both.
 *
 * For each column j the 'rows' partial results are loaded from C once,
 * accumulated over k in increasing order, and stored back once.
 *
 * All pointers are advanced one stride at a time, so no offset is ever
 * formed as an int product (rows*Cstride, K*Astride, ...) that could
 * overflow for very large matrices.
 */
static void
mm_double_TN_1_general_strip(const int rows, const int K, const int N,
                             const double *const a, const double *const B,
                             double *const c,
                             const int Astride, const int Bstride,
                             const int Cstride)
{
   double acc[9];
   int i, j, k;

   for (j = 0; j < N; ++j) {
      const double *ap = a;      /* row k of A, starting at this strip's column */
      const double *bp = B + j;  /* element (k, j) of B */
      double *crow = c + j;      /* element (i, j) of C */

      /* Load the current C values for this column of the strip. */
      acc[0] = crow[0];
      for (i = 1; i < rows; ++i) {
         crow += Cstride;
         acc[i] = crow[0];
      }

      /* One rank-1 step per k: acc[i] += A[k][i] * B[k][j]. */
      for (k = 0; k < K; ++k) {
         const double b0 = bp[0];

         for (i = 0; i < rows; ++i) {
            acc[i] += ap[i]*b0;
         }
         ap += Astride;
         bp += Bstride;
      }

      /* Write the finished column back to C. */
      crow = c + j;
      crow[0] = acc[0];
      for (i = 1; i < rows; ++i) {
         crow += Cstride;
         crow[0] = acc[i];
      }
   }
}

/*
 * General (M,K,N) = (M, K, N) matrix multiply:
 *
 *    C = transpose(A)*B + C
 *
 *  M, K, N   problem dimensions; C is MxN, B is KxN, and A is read as
 *            K rows of M entries (so transpose(A) is MxK).
 *  A, B      input matrices (read only).
 *  C         matrix updated in place.
 *  Astride, Bstride, Cstride
 *            number of entries between the starts of consecutive rows
 *            of A, B and C respectively.
 *
 * C is processed in strips of 9 rows, followed by one strip holding the
 * remaining M % 9 rows.  Every element of C is accumulated over k in
 * increasing order.
 *
 * If any of M, N or K is zero or negative the call returns without
 * touching C.
 *
 * NOTE(review): the file header says this file is generated by mm_cgen;
 * regenerating it would presumably replace this hand-written body --
 * confirm before relying on it.
 */
void
mm_double_TN_1_general(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
{
   const double *a = A;
   double *c = C;
   int strips;
   int tail_rows;
   int r;

   /* Nothing to do for an empty product; also rejects negative sizes. */
   if (M <= 0 || N <= 0 || K <= 0) {
      return;
   }

   tail_rows = M % 9;

   /* Full 9-row strips. */
   for (strips = M / 9; strips > 0; --strips) {
      mm_double_TN_1_general_strip(9, K, N, a, B, c,
                                   Astride, Bstride, Cstride);
      a += 9;
      /* Step row by row instead of adding 9*Cstride as an int product. */
      for (r = 0; r < 9; ++r) {
         c += Cstride;
      }
   }

   /* Remaining 1..8 rows, if any. */
   if (tail_rows != 0) {
      mm_double_TN_1_general_strip(tail_rows, K, N, a, B, c,
                                   Astride, Bstride, Cstride);
   }
}
