diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp
index d55afd474..031c96420 100644
--- a/src/USER-INTEL/angle_charmm_intel.cpp
+++ b/src/USER-INTEL/angle_charmm_intel.cpp
@@ -1,361 +1,361 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdlib.h>
 #include "angle_charmm_intel.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "math_const.h"
 #include "memory.h"
 #include "suffix.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // sin^2 threshold: eval() replaces 1/sin by INVSMALL when sin^2 < SMALL2
 // (both macros expand to a cast to flt_t, so they are only usable where
 //  flt_t names a type, i.e. inside the templated functions below)
 #define SMALL2     (flt_t)0.000001
 // value substituted for 1/sin below the threshold (= 1/sqrt(SMALL2))
 #define INVSMALL   (flt_t)1000.0
 // one angle-list record as read by eval(): atom indices a,b,c and type t
 typedef struct { int a,b,c,t;  } int4_t;
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Build on the plain AngleCharmm style and flag this instance as an
    /intel suffix variant.
 ------------------------------------------------------------------------- */
 
 AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) :
   AngleCharmm(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ----------------------------------------------------------------------
    Empty: nothing is released explicitly by this destructor.
 ------------------------------------------------------------------------- */
 
 AngleCharmmIntel::~AngleCharmmIntel() {}
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Run-time entry point: forwards to the templated compute() that matches
    the precision mode reported by the intel fix (mixed, double, or
    otherwise single precision).
 ------------------------------------------------------------------------- */
 
 void AngleCharmmIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // init_style() may have selected the base-class implementation
   if (_use_base) {
     AngleCharmm::compute(eflag, vflag);
     return;
   }
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
 
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
 
   // neither mixed nor double: single precision throughout
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Prepare energy/virial bookkeeping, then call the eval<EFLAG,VFLAG,
    NEWTON_BOND> instantiation that matches the run-time flags.
 ------------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void AngleCharmmIntel::compute(int eflag, int vflag,
                                IntelBuffers<flt_t,acc_t> *buffers,
                                const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   // nothing to tally: forces only
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // virial requested without energy
   if (vflag && !eflag) {
     if (force->newton_bond) eval<0,1,1>(vflag, buffers, fc);
     else eval<0,1,0>(vflag, buffers, fc);
     return;
   }
 
   // every remaining combination compiles in both tallies
   if (force->newton_bond) eval<1,1,1>(vflag, buffers, fc);
   else eval<1,1,0>(vflag, buffers, fc);
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Force kernel for the CHARMM angle style: for every entry of the angle
    list, adds the harmonic-angle contribution and the Urey-Bradley (1-3
    distance) contribution to a per-thread force buffer.
    Template switches:
      EFLAG        accumulate the energy into `energy`
      VFLAG        together with run-time vflag, accumulate the virial
      NEWTON_BOND  nonzero: forces are applied to all three atoms;
                   zero: only to atoms whose index is < nlocal
    fc supplies the per-type constants k, theta0, k_ub and r_ub.
    Returns without doing anything when the angle list is empty.
 ------------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void AngleCharmmIntel::eval(const int vflag,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->nanglelist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // distance between consecutive per-thread force slices (see `f` below);
   // sized from local+ghost atoms with NEWTON_BOND, local atoms otherwise
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // IP_PRE_get_buffers is a macro defined outside this file; it fills in
   // tc, f_start and ev_global (tc is used as the thread count below)
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // reduction targets; only the ones that will be read back are zeroed
   acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oeangle = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc) \
     reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     // nfrom/nto (and npl, tid) are assigned by the IP_PRE_omp_* macros,
     // which are defined outside this file
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     // this thread's slice of the force buffer, cleared when the fix asks
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     // view the angle list as packed {a,b,c,t} records, starting at row 0
     // (assumes the rows are stored contiguously, 4 ints each -- confirm)
     const int4_t * _noalias const anglelist =
       (int4_t *) neighbor->anglelist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
     // loop-local accumulators for the simd reduction; folded into the
     // omp reduction variables after the loop
     acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) seangle = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n ++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = anglelist[n].a;
       const int i2 = anglelist[n].b;
       const int i3 = anglelist[n].c;
       const int type = anglelist[n].t;
 
       // 1st bond
 
       const flt_t delx1 = x[i1].x - x[i2].x;
       const flt_t dely1 = x[i1].y - x[i2].y;
       const flt_t delz1 = x[i1].z - x[i2].z;
 
       const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
       // starts as 1/r1; multiplied by 1/r2 below to give 1/(r1*r2)
       flt_t ir12 = (flt_t)1.0/sqrt(rsq1);
 
       // 2nd bond
 
       const flt_t delx2 = x[i3].x - x[i2].x;
       const flt_t dely2 = x[i3].y - x[i2].y;
       const flt_t delz2 = x[i3].z - x[i2].z;
 
       const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
       ir12 *= (flt_t)1.0/sqrt(rsq2);
 
       // Urey-Bradley bond
 
       const flt_t delxUB = x[i3].x - x[i1].x;
       const flt_t delyUB = x[i3].y - x[i1].y;
       const flt_t delzUB = x[i3].z - x[i1].z;
 
       const flt_t rsqUB = delxUB*delxUB + delyUB*delyUB + delzUB*delzUB;
       const flt_t irUB = (flt_t)1.0/sqrt(rsqUB);
 
       // Urey-Bradley force & energy
 
       // 1/irUB recovers the i1-i3 distance; dr is its offset from r_ub
       const flt_t dr = (flt_t)1.0/irUB - fc.fc[type].r_ub;
       const flt_t rk = fc.fc[type].k_ub * dr;
 
       // no Urey-Bradley force when atoms i1 and i3 coincide
       flt_t forceUB;
       if (rsqUB > (flt_t)0.0) forceUB = (flt_t)-2.0*rk*irUB;
       else forceUB = 0.0;
 
       flt_t eangle;
       if (EFLAG) eangle = rk*dr;
 
       // angle (cos and sin)
 
       flt_t c = delx1*delx2 + dely1*dely2 + delz1*delz2;
       c *= ir12;
 
       // clamp the cosine to [-1,1] before it is passed to acos()
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       // s = 1/sin(theta), replaced by INVSMALL when sin^2 < SMALL2
       const flt_t sd = (flt_t)1.0 - c * c;
       flt_t s = (flt_t)1.0 / sqrt(sd);
       if (sd < SMALL2) s = INVSMALL;
 
       // harmonic force & energy
 
       const flt_t dtheta = acos(c) - fc.fc[type].theta0;
       const flt_t tk = fc.fc[type].k * dtheta;
 
       if (EFLAG) eangle += tk*dtheta;
 
       const flt_t a = (flt_t)-2.0 * tk * s;
       const flt_t a11 = a*c / rsq1;
       const flt_t a12 = -a * ir12;
       const flt_t a22 = a*c / rsq2;
 
       // f1 acts on atom i1, f3 on atom i3; the Urey-Bradley part enters
       // with opposite signs, and atom i2 receives -(f1+f3) below
       const flt_t f1x = a11*delx1 + a12*delx2 - delxUB*forceUB;
       const flt_t f1y = a11*dely1 + a12*dely2 - delyUB*forceUB;
       const flt_t f1z = a11*delz1 + a12*delz2 - delzUB*forceUB;
 
       const flt_t f3x = a22*delx2 + a12*delx1 + delxUB*forceUB;
       const flt_t f3y = a22*dely2 + a12*dely1 + delyUB*forceUB;
       const flt_t f3z = a22*delz2 + a12*delz1 + delzUB*forceUB;
 
       // apply force to each of 3 atoms
 
       #ifdef LMP_INTEL_USE_SIMDOFF
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= f1x + f3x;
           f[i2].y -= f1y + f3y;
           f[i2].z -= f1z + f3z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
       }
 
       // tally macro (defined outside this file); the two variants differ
       // only in which accumulators they are handed (s* vs o*)
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
                               i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
                               dely1, delz1, delx2, dely2, delz2, seangle,
                               f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
                               sv4, sv5);
         #else
         IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
                               i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
                               dely1, delz1, delx2, dely2, delz2, oeangle,
                               f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
                               ov4, ov5);
         #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     // fold the simd-loop accumulators into the omp reduction variables
     if (EFLAG) oeangle += seangle;
     if (VFLAG && vflag) {
         ov0 += sv0; ov1 += sv1; ov2 += sv2;
         ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
 
   // add the reduced totals to the style's global energy / virial
   if (EFLAG) energy += oeangle;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void AngleCharmmIntel::init_style()
 {
   AngleCharmm::init_style();
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Copy the per-type coefficients (k, theta0, k_ub, r_ub) into the packed
    table fc.fc, converting to flt_t on assignment.
    The table is sized nangletypes+1.  With this change the loop fills
    indices 1..nangletypes only (it previously started at 0), so entry 0
    is allocated but left unwritten -- presumably because angle types are
    numbered from 1; confirm against the base-class coefficient arrays.
    The buffers argument is not used in this function.
 ------------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void AngleCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
                                         IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->nangletypes + 1;
   fc.set_ntypes(bp1,memory);
 
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.fc[i].k = k[i];
     fc.fc[i].theta0 = theta0[i];
     fc.fc[i].k_ub = k_ub[i];
     fc.fc[i].r_ub = r_ub[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Resize the packed per-type coefficient table to nangletypes entries.
    Nothing happens to the table when the size is unchanged.  Otherwise an
    existing table is released through the Memory object recorded by the
    previous call, and (for nangletypes > 0) a new one is allocated through
    the Memory object passed to THIS call.  Allocating through _memory, as
    was done before, used the pointer before it had been assigned from the
    argument.
    A NULL memory argument falls back to the recorded object and does not
    overwrite it.
 ------------------------------------------------------------------------- */
 
 template <class flt_t>
 void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
                                                      Memory *memory) {
   if (nangletypes != _nangletypes) {
     // free with the allocator that produced the current table
     if (_nangletypes > 0)
       _memory->destroy(fc);
 
     // allocate with the allocator supplied now, not the not-yet-updated
     // member
     if (nangletypes > 0) {
       Memory * const allocator = (memory != NULL) ? memory : _memory;
       allocator->create(fc,nangletypes,"anglecharmmintel.fc");
     }
   }
   _nangletypes = nangletypes;
   if (memory != NULL) _memory = memory;
 }
diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp
index 47e0add69..84220277d 100644
--- a/src/USER-INTEL/angle_harmonic_intel.cpp
+++ b/src/USER-INTEL/angle_harmonic_intel.cpp
@@ -1,341 +1,341 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdlib.h>
 #include "angle_harmonic_intel.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "math_const.h"
 #include "memory.h"
 #include "suffix.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // sin^2 threshold: eval() replaces 1/sin by INVSMALL when sin^2 < SMALL2
 // (both macros cast to flt_t, so they are only usable where flt_t names
 //  a type, i.e. inside the templated functions below)
 #define SMALL2 (flt_t)0.000001
 // value substituted for 1/sin below the threshold (= 1/sqrt(SMALL2))
 #define INVSMALL (flt_t)1000.0
 // one angle-list record as read by eval(): atom indices a,b,c and type t
 typedef struct { int a,b,c,t;  } int4_t;
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Build on the plain AngleHarmonic style and flag this instance as an
    /intel suffix variant.
 ------------------------------------------------------------------------- */
 
 AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) :
   AngleHarmonic(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ----------------------------------------------------------------------
    Empty: nothing is released explicitly by this destructor.
 ------------------------------------------------------------------------- */
 
 AngleHarmonicIntel::~AngleHarmonicIntel() {}
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Run-time entry point: forwards to the templated compute() that matches
    the precision mode reported by the intel fix (mixed, double, or
    otherwise single precision).
 ------------------------------------------------------------------------- */
 
 void AngleHarmonicIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // init_style() may have selected the base-class implementation
   if (_use_base) {
     AngleHarmonic::compute(eflag, vflag);
     return;
   }
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
 
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
 
   // neither mixed nor double: single precision throughout
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Prepare energy/virial bookkeeping, then call the eval<EFLAG,VFLAG,
    NEWTON_BOND> instantiation that matches the run-time flags.
 ------------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void AngleHarmonicIntel::compute(int eflag, int vflag,
                                  IntelBuffers<flt_t,acc_t> *buffers,
                                  const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   // nothing to tally: forces only
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // virial requested without energy
   if (vflag && !eflag) {
     if (force->newton_bond) eval<0,1,1>(vflag, buffers, fc);
     else eval<0,1,0>(vflag, buffers, fc);
     return;
   }
 
   // every remaining combination compiles in both tallies
   if (force->newton_bond) eval<1,1,1>(vflag, buffers, fc);
   else eval<1,1,0>(vflag, buffers, fc);
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Force kernel for the harmonic angle style: for every entry of the
    angle list, adds the harmonic-angle forces to a per-thread force
    buffer.
    Template switches:
      EFLAG        accumulate the energy into `energy`
      VFLAG        together with run-time vflag, accumulate the virial
      NEWTON_BOND  nonzero: forces are applied to all three atoms;
                   zero: only to atoms whose index is < nlocal
    fc supplies the per-type constants k and theta0.
    Returns without doing anything when the angle list is empty.
 ------------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void AngleHarmonicIntel::eval(const int vflag,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->nanglelist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // distance between consecutive per-thread force slices (see `f` below);
   // sized from local+ghost atoms with NEWTON_BOND, local atoms otherwise
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // IP_PRE_get_buffers is a macro defined outside this file; it fills in
   // tc, f_start and ev_global (tc is used as the thread count below)
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // reduction targets; only the ones that will be read back are zeroed
   acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oeangle = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc) \
     reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     // nfrom/nto (and npl, tid) are assigned by the IP_PRE_omp_* macros,
     // which are defined outside this file
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     // this thread's slice of the force buffer, cleared when the fix asks
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     // view the angle list as packed {a,b,c,t} records, starting at row 0
     // (assumes the rows are stored contiguously, 4 ints each -- confirm)
     const int4_t * _noalias const anglelist =
       (int4_t *) neighbor->anglelist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
     // loop-local accumulators for the simd reduction; folded into the
     // omp reduction variables after the loop
     acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) seangle = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n ++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = anglelist[n].a;
       const int i2 = anglelist[n].b;
       const int i3 = anglelist[n].c;
       const int type = anglelist[n].t;
 
       // 1st bond
 
       const flt_t delx1 = x[i1].x - x[i2].x;
       const flt_t dely1 = x[i1].y - x[i2].y;
       const flt_t delz1 = x[i1].z - x[i2].z;
 
       const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
       // note: r1 (and r2 below) hold INVERSE bond lengths
       const flt_t r1 = (flt_t)1.0/sqrt(rsq1);
 
       // 2nd bond
 
       const flt_t delx2 = x[i3].x - x[i2].x;
       const flt_t dely2 = x[i3].y - x[i2].y;
       const flt_t delz2 = x[i3].z - x[i2].z;
 
       const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
       const flt_t r2 = (flt_t)1.0/sqrt(rsq2);
 
       // angle (cos and sin)
 
       flt_t c = delx1*delx2 + dely1*dely2 + delz1*delz2;
       const flt_t r1r2 = r1 * r2;
       c *= r1r2;
 
       // clamp the cosine to [-1,1] before it is passed to acos()
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       // s = 1/sin(theta), replaced by INVSMALL when sin^2 < SMALL2
       const flt_t sd = (flt_t)1.0 - c * c;
       flt_t s = (flt_t)1.0/sqrt(sd);
       if (sd < SMALL2) s = INVSMALL;
 
       // harmonic force & energy
 
       const flt_t dtheta = acos(c) - fc.fc[type].theta0;
       const flt_t tk = fc.fc[type].k * dtheta;
 
       flt_t eangle;
       if (EFLAG) eangle = tk*dtheta;
 
       const flt_t a = (flt_t)-2.0 * tk * s;
       const flt_t ac = a*c;
       const flt_t a11 = ac / rsq1;
       const flt_t a12 = -a * (r1r2);
       const flt_t a22 = ac / rsq2;
 
       // f1 acts on atom i1, f3 on atom i3; atom i2 receives -(f1+f3)
       const flt_t f1x = a11*delx1 + a12*delx2;
       const flt_t f1y = a11*dely1 + a12*dely2;
       const flt_t f1z = a11*delz1 + a12*delz2;
 
       const flt_t f3x = a22*delx2 + a12*delx1;
       const flt_t f3y = a22*dely2 + a12*dely1;
       const flt_t f3z = a22*delz2 + a12*delz1;
 
       // apply force to each of 3 atoms
 
       #ifdef LMP_INTEL_USE_SIMDOFF
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= f1x + f3x;
           f[i2].y -= f1y + f3y;
           f[i2].z -= f1z + f3z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
       }
 
       // tally macro (defined outside this file); the two variants differ
       // only in which accumulators they are handed (s* vs o*)
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
                               f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
                               delz1, delx2, dely2, delz2, seangle, f,
                               NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4,
                               sv5);
         #else
         IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
                               f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
                               delz1, delx2, dely2, delz2, oeangle, f,
                               NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4,
                               ov5);
         #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     // fold the simd-loop accumulators into the omp reduction variables
     if (EFLAG) oeangle += seangle;
     if (VFLAG && vflag) {
         ov0 += sv0; ov1 += sv1; ov2 += sv2;
         ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
 
   // add the reduced totals to the style's global energy / virial
   if (EFLAG) energy += oeangle;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void AngleHarmonicIntel::init_style()
 {
   AngleHarmonic::init_style();
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Copy the per-type coefficients (k, theta0) into the packed table
    fc.fc, converting to flt_t on assignment.
    The table is sized nangletypes+1.  With this change the loop fills
    indices 1..nangletypes only (it previously started at 0), so entry 0
    is allocated but left unwritten -- presumably because angle types are
    numbered from 1; confirm against the base-class coefficient arrays.
    The buffers argument is not used in this function.
 ------------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
                                         IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->nangletypes + 1;
   fc.set_ntypes(bp1,memory);
 
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.fc[i].k = k[i];
     fc.fc[i].theta0 = theta0[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Resize the packed per-type coefficient table to nangletypes entries.
    Nothing happens to the table when the size is unchanged.  Otherwise an
    existing table is released through the Memory object recorded by the
    previous call, and (for nangletypes > 0) a new one is allocated through
    the Memory object passed to THIS call.  Allocating through _memory, as
    was done before, used the pointer before it had been assigned from the
    argument.
    A NULL memory argument falls back to the recorded object and does not
    overwrite it.
    The allocation label now names this style; it used to carry the
    angle/charmm label.
 ------------------------------------------------------------------------- */
 
 template <class flt_t>
 void AngleHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
                                                        Memory *memory) {
   if (nangletypes != _nangletypes) {
     // free with the allocator that produced the current table
     if (_nangletypes > 0)
       _memory->destroy(fc);
 
     // allocate with the allocator supplied now, not the not-yet-updated
     // member
     if (nangletypes > 0) {
       Memory * const allocator = (memory != NULL) ? memory : _memory;
       allocator->create(fc,nangletypes,"angleharmonicintel.fc");
     }
   }
   _nangletypes = nangletypes;
   if (memory != NULL) _memory = memory;
 }
diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp
index bb96135b2..93d64ed63 100644
--- a/src/USER-INTEL/bond_fene_intel.cpp
+++ b/src/USER-INTEL/bond_fene_intel.cpp
@@ -1,320 +1,320 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Stan Moore (Sandia)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdlib.h>
 #include "bond_fene_intel.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "suffix.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 // one bond-list record as read by eval(): atom indices a,b and bond type t
 typedef struct { int a,b,t;  } int3_t;
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Build on the plain BondFENE style and flag this instance as an
    /intel suffix variant.
 ------------------------------------------------------------------------- */
 
 BondFENEIntel::BondFENEIntel(LAMMPS *lmp) :
   BondFENE(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ----------------------------------------------------------------------
    Empty: nothing is released explicitly by this destructor.
 ------------------------------------------------------------------------- */
 
 BondFENEIntel::~BondFENEIntel() {}
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Run-time entry point: forwards to the templated compute() that matches
    the precision mode reported by the intel fix (mixed, double, or
    otherwise single precision).
 ------------------------------------------------------------------------- */
 
 void BondFENEIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // init_style() may have selected the base-class implementation
   if (_use_base) {
     BondFENE::compute(eflag, vflag);
     return;
   }
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
 
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
 
   // neither mixed nor double: single precision throughout
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Prepare energy/virial bookkeeping, then call the eval<EFLAG,VFLAG,
    NEWTON_BOND> instantiation that matches the run-time flags.
 ------------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void BondFENEIntel::compute(int eflag, int vflag,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   // nothing to tally: forces only
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // virial requested without energy
   if (vflag && !eflag) {
     if (force->newton_bond) eval<0,1,1>(vflag, buffers, fc);
     else eval<0,1,0>(vflag, buffers, fc);
     return;
   }
 
   // every remaining combination compiles in both tallies
   if (force->newton_bond) eval<1,1,1>(vflag, buffers, fc);
   else eval<1,1,0>(vflag, buffers, fc);
 }
 
 /* ----------------------------------------------------------------------
    Force kernel for the FENE bond style: for every entry of the bond
    list, adds the FENE attraction and (inside its cutoff) the LJ term to
    a per-thread force buffer.
    Template switches:
      EFLAG        accumulate the energy into `energy`
      VFLAG        together with run-time vflag, accumulate the virial
      NEWTON_BOND  nonzero: forces are applied to both atoms;
                   zero: only to atoms whose index is < nlocal
    fc supplies the per-type constants k, ir0sq, sigma and epsilon.
    Returns without doing anything when the bond list is empty.
    NOTE(review): error->warning()/error->one() are called from inside the
    loop that runs in the OpenMP parallel region -- confirm that is safe.
 ------------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void BondFENEIntel::eval(const int vflag,
                          IntelBuffers<flt_t,acc_t> *buffers,
                          const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nbondlist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // distance between consecutive per-thread force slices (see `f` below);
   // sized from local+ghost atoms with NEWTON_BOND, local atoms otherwise
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // IP_PRE_get_buffers is a macro defined outside this file; it fills in
   // tc, f_start and ev_global (tc is used as the thread count below)
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // reduction targets; only the ones that will be read back are zeroed
   acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oebond = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)           \
     reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     // nfrom/nto (and npl, tid) are assigned by the IP_PRE_omp_* macros,
     // which are defined outside this file
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     // this thread's slice of the force buffer, cleared when the fix asks
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     // view the bond list as packed {a,b,t} records, starting at row 0
     // (assumes the rows are stored contiguously, 3 ints each -- confirm)
     const int3_t * _noalias const bondlist =
       (int3_t *) neighbor->bondlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
     // loop-local accumulators for the simd reduction; folded into the
     // omp reduction variables after the loop
     acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) sebond = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n ++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = bondlist[n].a;
       const int i2 = bondlist[n].b;
       const int type = bondlist[n].t;
 
       // per-type constants; ir0sq is 1/r0^2 as packed by pack_force_const()
       const flt_t ir0sq = fc.fc[type].ir0sq;
       const flt_t k = fc.fc[type].k;
       const flt_t sigma = fc.fc[type].sigma;
       const flt_t sigmasq = sigma*sigma;
       const flt_t epsilon = fc.fc[type].epsilon;
 
       const flt_t delx = x[i1].x - x[i2].x;
       const flt_t dely = x[i1].y - x[i2].y;
       const flt_t delz = x[i1].z - x[i2].z;
 
       const flt_t rsq = delx*delx + dely*dely + delz*delz;
       flt_t rlogarg = (flt_t)1.0 - rsq * ir0sq;
       flt_t irsq = (flt_t)1.0 / rsq;
 
       // rlogarg = 1 - (r/r0)^2 drops toward zero (and below) as r -> r0
       // once it is below 0.1: issue a warning and clamp rlogarg to 0.1
       // if rlogarg <= -3 (r >= 2*r0) something serious is wrong, abort
 
       if (rlogarg < (flt_t)0.1) {
         char str[128];
         sprintf(str,"FENE bond too long: " BIGINT_FORMAT " "
                 TAGINT_FORMAT " " TAGINT_FORMAT " %g",
                 update->ntimestep,atom->tag[i1],atom->tag[i2],sqrt(rsq));
         error->warning(FLERR,str,0);
         if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond");
         rlogarg = (flt_t)0.1;
       }
 
       flt_t fbond = -k/rlogarg;
 
       // force from LJ term
 
       // sr2/sr6 are assigned only inside the cutoff; the energy branch
       // below reads sr6 under the same condition
       flt_t sr2,sr6;
       if (rsq < (flt_t)TWO_1_3*sigmasq) {
         sr2 = sigmasq * irsq;
         sr6 = sr2 * sr2 * sr2;
         fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq;
       }
 
       // energy
 
       flt_t ebond;
       if (EFLAG) {
         ebond = (flt_t)-0.5 * k / ir0sq * log(rlogarg);
         if (rsq < (flt_t)TWO_1_3 * sigmasq)
           ebond += (flt_t)4.0 * epsilon * sr6 * (sr6 - (flt_t)1.0) + epsilon;
       }
 
       // apply force to each of 2 atoms
 
       #ifdef LMP_INTEL_USE_SIMDOFF
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += delx*fbond;
           f[i1].y += dely*fbond;
           f[i1].z += delz*fbond;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= delx*fbond;
           f[i2].y -= dely*fbond;
           f[i2].z -= delz*fbond;
         }
       }
 
       // tally macro (defined outside this file); the two variants differ
       // only in which accumulators they are handed (s* vs o*)
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
                              delx, dely, delz, sebond, f, NEWTON_BOND,
                              nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
         #else
         IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
                              delx, dely, delz, oebond, f, NEWTON_BOND,
                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
         #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     // fold the simd-loop accumulators into the omp reduction variables
     if (EFLAG) oebond += sebond;
     if (VFLAG && vflag) {
        ov0 += sv0; ov1 += sv1; ov2 += sv2;
        ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
 
   // add the reduced totals to the style's global energy / virial
   if (EFLAG) energy += oebond;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void BondFENEIntel::init_style()
 {
   BondFENE::init_style();
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Copy the per-type coefficients into the packed table fc.fc, converting
    to flt_t on assignment: k, sigma and epsilon are copied as they are,
    while r0 is stored as ir0sq = 1/r0^2.
    The table is sized nbondtypes+1.  With this change the loop fills
    indices 1..nbondtypes only (it previously started at 0), so entry 0
    is allocated but left unwritten and 1/(r0[0]*r0[0]) is no longer
    evaluated -- presumably because bond types are numbered from 1;
    confirm against the base-class coefficient arrays.
    The buffers argument is not used in this function.
 ------------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
 void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
                                          IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->nbondtypes + 1;
   fc.set_ntypes(bp1,memory);
 
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.fc[i].k = k[i];
     fc.fc[i].ir0sq = 1.0 / (r0[i] * r0[i]);
     fc.fc[i].sigma = sigma[i];
     fc.fc[i].epsilon = epsilon[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Resize the packed per-type coefficient table to nbondtypes entries.
    Nothing happens to the table when the size is unchanged.  Otherwise an
    existing table is released through the Memory object recorded by the
    previous call, and (for nbondtypes > 0) a new one is allocated through
    the Memory object passed to THIS call.  Allocating through _memory, as
    was done before, used the pointer before it had been assigned from the
    argument.
    A NULL memory argument falls back to the recorded object and does not
    overwrite it.
 ------------------------------------------------------------------------- */
 
 template <class flt_t>
 void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
                                                   Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     // free with the allocator that produced the current table
     if (_nbondtypes > 0)
       _memory->destroy(fc);
 
     // allocate with the allocator supplied now, not the not-yet-updated
     // member
     if (nbondtypes > 0) {
       Memory * const allocator = (memory != NULL) ? memory : _memory;
       allocator->create(fc,nbondtypes,"bondfeneintel.fc");
     }
   }
   _nbondtypes = nbondtypes;
   if (memory != NULL) _memory = memory;
 }
diff --git a/src/USER-INTEL/bond_harmonic_intel.cpp b/src/USER-INTEL/bond_harmonic_intel.cpp
index beb0ebcdd..0ac466f11 100644
--- a/src/USER-INTEL/bond_harmonic_intel.cpp
+++ b/src/USER-INTEL/bond_harmonic_intel.cpp
@@ -1,289 +1,289 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdlib.h>
 #include "bond_harmonic_intel.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "suffix.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
 // View of one bond-list row as used by eval(): the two atom indices
 // (a, b) followed by the bond type (t).
 typedef struct { int a,b,t;  } int3_t;
 
 /* ---------------------------------------------------------------------- */
 
 // Build on top of the plain BondHarmonic style and add the INTEL bit to
 // the suffix flags of this instance.
 BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Empty destructor body: no cleanup is performed here.
 BondHarmonicIntel::~BondHarmonicIntel()
 {
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Entry point for the bond force computation.
 //   eflag, vflag - energy / virial request flags, forwarded unchanged
 // Picks the templated compute() instantiation that matches the precision
 // mode reported by the Intel package fix.
 void BondHarmonicIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // init_style() raises _use_base when the offload balance is non-zero;
   // in that case the base-class implementation does all the work
   if (_use_base) {
     BondHarmonic::compute(eflag, vflag);
     return;
   }
   #endif
 
   // mixed precision: float arithmetic, double accumulation
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
 
   // full double precision
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
 
   // anything else: single precision throughout
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Precision-specific driver: sets up energy/virial tallying and then
 // dispatches to the eval<EFLAG,VFLAG,NEWTON_BOND> instantiation that
 // matches the runtime flags.
 //   eflag, vflag - energy / virial request flags
 //   buffers      - Intel package buffers for this precision
 //   fc           - packed per-type force constants
 template <class flt_t, class acc_t>
 void BondHarmonicIntel::compute(int eflag, int vflag,
                                 IntelBuffers<flt_t,acc_t> *buffers,
                                 const ForceConst<flt_t> &fc)
 {
   if (!(eflag || vflag)) evflag = 0;
   else ev_setup(eflag,vflag);
 
   // no tallying requested: cheapest kernels
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // virial only: energy accumulation is compiled out
   if (vflag && !eflag) {
     if (force->newton_bond) eval<0,1,1>(vflag, buffers, fc);
     else eval<0,1,0>(vflag, buffers, fc);
     return;
   }
 
   // energy requested (with or without virial)
   if (force->newton_bond) eval<1,1,1>(vflag, buffers, fc);
   else eval<1,1,0>(vflag, buffers, fc);
 }
 
 // Harmonic bond kernel: loops over every entry of neighbor->bondlist,
 // accumulates forces into a per-thread force array and, when enabled,
 // tallies energy and virial.
 // Template parameters:
 //   EFLAG       - non-zero to accumulate bond energy
 //   VFLAG       - non-zero to compile in virial accumulation (still
 //                 gated at run time by `vflag`)
 //   NEWTON_BOND - non-zero to apply forces to atoms with index >= nlocal
 // Arguments:
 //   vflag   - run-time virial flag
 //   buffers - source of the packed coordinates and force storage
 //   fc      - per-type constants (k, r0) indexed by bond type
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void BondHarmonicIntel::eval(const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nbondlist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // length of one thread's force slab: it must cover ghost atoms too
   // when forces are applied to them (NEWTON_BOND)
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // NOTE(review): IP_PRE_get_buffers is a project macro; presumably it
   // fills in tc, f_start and ev_global - confirm in the Intel headers
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // accumulators combined across threads by the reduction clause below
   acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oebond = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)           \
     reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     // NOTE(review): the IP_PRE_omp_* macros presumably assign this
     // thread its share of [0,inum) and its id - confirm in the headers
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     // this thread's private slab of the force array
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int3_t * _noalias const bondlist =
       (int3_t *) neighbor->bondlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
     // loop-local accumulators for the SIMD reduction; folded into the
     // o* variables after the loop
     acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) sebond = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n ++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = bondlist[n].a;
       const int i2 = bondlist[n].b;
       const int type = bondlist[n].t;
 
       const flt_t delx = x[i1].x - x[i2].x;
       const flt_t dely = x[i1].y - x[i2].y;
       const flt_t delz = x[i1].z - x[i2].z;
 
       // dr is the deviation from the rest length r0; rk = k*dr
       const flt_t rsq = delx*delx + dely*dely + delz*delz;
       const flt_t r = sqrt(rsq);
       const flt_t dr = r - fc.fc[type].r0;
       const flt_t rk = fc.fc[type].k * dr;
 
       // force & energy
 
       // fbond = -2*k*dr/r; forced to zero for coincident atoms to avoid
       // dividing by r == 0
       flt_t fbond;
       if (r > (flt_t)0.0) fbond = (flt_t)-2.0*rk/r;
       else fbond = (flt_t)0.0;
 
       // ebond = k*dr^2
       flt_t ebond;
       if (EFLAG) ebond = rk*dr;
 
       // apply force to each of 2 atoms
       #ifdef LMP_INTEL_USE_SIMDOFF
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += delx*fbond;
           f[i1].y += dely*fbond;
           f[i1].z += delz*fbond;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= delx*fbond;
           f[i2].y -= dely*fbond;
           f[i2].z -= delz*fbond;
         }
       }
 
       // tally into the loop-local (s*) or thread-level (o*) accumulators
       // depending on which loop form is compiled
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
                              fbond, delx, dely, delz, sebond, f,
                              NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
                              sv4, sv5);
         #else
         IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
                              fbond, delx, dely, delz, oebond, f,
                              NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
                              ov4, ov5);
         #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oebond += sebond;
     if (VFLAG && vflag) {
        ov0 += sv0; ov1 += sv1; ov2 += sv2;
        ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
 
   // add the reduced totals to the style-level energy and virial
   if (EFLAG) energy += oebond;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   // NOTE(review): presumably tells the fix that the per-thread force
   // slabs still need to be reduced - confirm in FixIntel
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Style initialisation: runs the base-class setup, locates the Intel
 // package fix (a hard error if it is absent) and packs the force
 // constants into the table matching the fix's precision mode.
 void BondHarmonicIntel::init_style()
 {
   BondHarmonic::init_style();
 
   const int ifix = modify->find_fix("package_intel");
   if (ifix < 0) {
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   }
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   // with a non-zero offload balance compute() falls back to the base
   // class, so nothing further is prepared here
   _use_base = (fix->offload_balance() != 0.0) ? 1 : 0;
   if (_use_base) return;
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     pack_force_const(force_const_single, fix->get_mixed_buffers());
     return;
   }
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     pack_force_const(force_const_double, fix->get_double_buffers());
     return;
   }
   pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy the per-bond-type harmonic coefficients held by the base class
 // (k, r0) into the precision-specific table `fc`.
 //   fc      - destination table; resized here to nbondtypes+1 entries
 //   buffers - not referenced in this body
 template <class flt_t, class acc_t>
 void BondHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
                                          IntelBuffers<flt_t,acc_t> *buffers)
 {
   // one extra slot so the table can be indexed by i in [1, nbondtypes]
   const int bp1 = atom->nbondtypes + 1;
   fc.set_ntypes(bp1,memory);
 
   // the patched (+) loop starts at 1 and leaves entry 0 untouched
   // NOTE(review): presumably bond types are 1-based and k[0]/r0[0] hold
   // no valid data - confirm against the base class
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.fc[i].k = k[i];
     fc.fc[i].r0 = r0[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the per-bond-type coefficient table `fc`.
 //   nbondtypes - new number of entries; 0 releases the table
 //   memory     - allocator used for create/destroy; may be null, in
 //                which case the previously stored allocator is reused
 // The table is only reallocated when the requested size differs from
 // the current one; existing contents are not preserved across a resize.
 template <class flt_t>
 void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
                                                       Memory *memory) {
   // Adopt the caller's allocator BEFORE it is dereferenced below.
   // Previously _memory was assigned only at the end of this function,
   // so the first create() went through a pointer this function had not
   // yet set, and a null `memory` argument overwrote a valid one.
   if (memory) _memory = memory;
 
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(fc);
 
     if (nbondtypes > 0)
       _memory->create(fc,nbondtypes,"bondharmonicintel.fc");
   }
   _nbondtypes = nbondtypes;
 }
diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp
index 715cef4d3..0e13e9225 100644
--- a/src/USER-INTEL/dihedral_charmm_intel.cpp
+++ b/src/USER-INTEL/dihedral_charmm_intel.cpp
@@ -1,991 +1,991 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
 #include <math.h>
 #include "dihedral_charmm_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "pair.h"
 #include "update.h"
 #include "error.h"
 
 #ifdef LMP_USE_AVXCD
 #if (__INTEL_COMPILER_BUILD_DATE > 20160414)
 #define LMP_USE_AVXCD_DHC
 #endif
 #endif
 
 #ifdef LMP_USE_AVXCD_DHC
 #include "intel_simd.h"
 using namespace ip_simd;
 #endif
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // Bounds on the computed dihedral cosine: eval() reports a
 // "Dihedral problem" when the value falls outside
 // [MTOLERANCE, PTOLERANCE], then clamps it to [-1, 1].
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // View of one dihedral-list row as used by eval(): four atom indices
 // (a, b, c, d) followed by the dihedral type (t).
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
 // Build on top of the plain DihedralCharmm style and add the INTEL bit
 // to the suffix flags of this instance.
 DihedralCharmmIntel::DihedralCharmmIntel(class LAMMPS *lmp)
   : DihedralCharmm(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Entry point for the dihedral force computation.
 //   eflag, vflag - energy / virial request flags, forwarded unchanged
 // Picks the templated compute() instantiation that matches the precision
 // mode reported by the Intel package fix.
 void DihedralCharmmIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // when _use_base is set the base-class implementation does all the work
   if (_use_base) {
     DihedralCharmm::compute(eflag, vflag);
     return;
   }
   #endif
 
   // mixed precision: float arithmetic, double accumulation
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
 
   // full double precision
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
 
   // anything else: single precision throughout
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Precision-specific driver: sets up energy/virial tallying, makes sure
 // the pair style will accept the 1-4 virial contribution, and then
 // dispatches to the eval<EFLAG,VFLAG,NEWTON_BOND> instantiation that
 // matches the runtime flags.
 //   eflag, vflag - energy / virial request flags
 //   buffers      - Intel package buffers for this precision
 //   fc           - packed per-type force constants
 template <class flt_t, class acc_t>
 void DihedralCharmmIntel::compute(int eflag, int vflag,
                                   IntelBuffers<flt_t,acc_t> *buffers,
                                   const ForceConst<flt_t> &fc)
 {
   if (!(eflag || vflag)) evflag = 0;
   else ev_setup(eflag,vflag);
 
   // insure pair->ev_tally() will use 1-4 virial contribution
   if (weightflag && vflag_global == 2) {
     force->pair->vflag_global = 1;
     force->pair->vflag_either = 1;
   }
 
   // no tallying requested: cheapest kernels
   if (!evflag) {
     if (force->newton_bond) eval<0,0,1>(vflag, buffers, fc);
     else eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // virial only: energy accumulation is compiled out
   if (vflag && !eflag) {
     if (force->newton_bond) eval<0,1,1>(vflag, buffers, fc);
     else eval<0,1,0>(vflag, buffers, fc);
     return;
   }
 
   // energy requested (with or without virial)
   if (force->newton_bond) eval<1,1,1>(vflag, buffers, fc);
   else eval<1,1,0>(vflag, buffers, fc);
 }
 
 #ifndef LMP_USE_AVXCD_DHC
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralCharmmIntel::eval(const int vflag,
                                IntelBuffers<flt_t,acc_t> *buffers,
                                const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   flt_t * _noalias const q = buffers->get_q(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
   if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)           \
     reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
               opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
     #if defined(LMP_SIMD_COMPILER_TEST)
     int nfrom, nto, tid;
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     int nfrom, npl, nto, tid;
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];
     const flt_t qqrd2e = force->qqrd2e;
 
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
     if (EFLAG) sevdwl = secoul = sedihedral = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
       spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
     }
 
     #if defined(LMP_SIMD_COMPILER_TEST)
     #pragma vector aligned
     #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
                            sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
     for (int n = nfrom; n < nto; n++) {
     #endif
     for (int n = nfrom; n < nto; n += npl) {
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
       const int i4 = dihedrallist[n].d;
       const int type = dihedrallist[n].t;
 
       // 1st bond
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
       const int itype = x[i1].w;
 
       // 2nd bond
 
       const flt_t vb2xm = x[i2].x - x[i3].x;
       const flt_t vb2ym = x[i2].y - x[i3].y;
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
 
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
       const int jtype = x[i4].w;
 
       // 1-4
 
       const flt_t delx = x[i1].x - x[i4].x;
       const flt_t dely = x[i1].y - x[i4].y;
       const flt_t delz = x[i1].z - x[i4].z;
 
 
       // c,s calculation
 
       const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
       const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
       const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
       const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
       const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
       const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
 
       const flt_t rasq = ax*ax + ay*ay + az*az;
       const flt_t rbsq = bx*bx + by*by + bz*bz;
       const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const flt_t rg = sqrt(rgsq);
 
       flt_t rginv, ra2inv, rb2inv;
       rginv = ra2inv = rb2inv = (flt_t)0.0;
       if (rg > 0) rginv = (flt_t)1.0/rg;
       if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
       if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
       const flt_t rabinv = sqrt(ra2inv*rb2inv);
 
       flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
 
       // error check
       #ifndef LMP_SIMD_COMPILER_TEST
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me = comm->me;
 
         if (screen) {
           char str[128];
           sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,tid,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
           error->warning(FLERR,str,0);
           fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
           fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
           fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
           fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
       #endif
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t tcos_shift = fc.bp[type].cos_shift;
       const flt_t tsin_shift = fc.bp[type].sin_shift;
       const flt_t tk = fc.bp[type].k;
       const int m = fc.bp[type].multiplicity;
 
       flt_t p = (flt_t)1.0;
       flt_t ddf1, df1;
       ddf1 = df1 = (flt_t)0.0;
 
       for (int i = 0; i < m; i++) {
         ddf1 = p*c - df1*s;
         df1 = p*s + df1*c;
         p = ddf1;
       }
 
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 *= -m;
       p += (flt_t)1.0;
 
       if (m == 0) {
         p = (flt_t)1.0 + tcos_shift;
         df1 = (flt_t)0.0;
       }
 
       const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
       const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
       const flt_t fga = fg*ra2inv*rginv;
       const flt_t hgb = hg*rb2inv*rginv;
       const flt_t gaa = -ra2inv*rg;
       const flt_t gbb = rb2inv*rg;
 
       const flt_t dtfx = gaa*ax;
       const flt_t dtfy = gaa*ay;
       const flt_t dtfz = gaa*az;
       const flt_t dtgx = fga*ax - hgb*bx;
       const flt_t dtgy = fga*ay - hgb*by;
       const flt_t dtgz = fga*az - hgb*bz;
       const flt_t dthx = gbb*bx;
       const flt_t dthy = gbb*by;
       const flt_t dthz = gbb*bz;
 
       const flt_t df = -tk * df1;
 
       const flt_t sx2 = df*dtgx;
       const flt_t sy2 = df*dtgy;
       const flt_t sz2 = df*dtgz;
 
       flt_t f1x = df*dtfx;
       flt_t f1y = df*dtfy;
       flt_t f1z = df*dtfz;
 
       const flt_t f2x = sx2 - f1x;
       const flt_t f2y = sy2 - f1y;
       const flt_t f2z = sz2 - f1z;
 
       flt_t f4x = df*dthx;
       flt_t f4y = df*dthy;
       flt_t f4z = df*dthz;
 
       const flt_t f3x = -sx2 - f4x;
       const flt_t f3y = -sy2 - f4y;
       const flt_t f3z = -sz2 - f4z;
 
       if (EFLAG || VFLAG) {
         flt_t deng;
         if (EFLAG) deng = tk * p;
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3,
                               i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
                               f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
                               vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND,
                               nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
       }
 
 
       #if defined(LMP_SIMD_COMPILER_TEST)
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
           f[i2].y += f2y;
           f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
       }
 
       // 1-4 LJ and Coulomb interactions
       // tally energy/virial in pair, using newton_bond as newton flag
 
       const flt_t tweight = fc.weight[type];
       const flt_t rsq = delx*delx + dely*dely + delz*delz;
       const flt_t r2inv = (flt_t)1.0/rsq;
       const flt_t r6inv = r2inv*r2inv*r2inv;
 
       flt_t forcecoul;
       if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
       else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
       const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv -
                                      fc.ljp[itype][jtype].lj2);
       const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
 
       if (NEWTON_BOND || i1 < nlocal) {
         f1x += delx*fpair;
         f1y += dely*fpair;
         f1z += delz*fpair;
       }
       if (NEWTON_BOND || i4 < nlocal) {
         f4x -= delx*fpair;
         f4y -= dely*fpair;
         f4z -= delz*fpair;
       }
 
       if (EFLAG || VFLAG) {
         flt_t ev_pre = (flt_t)0;
         if (NEWTON_BOND || i1 < nlocal)
           ev_pre += (flt_t)0.5;
         if (NEWTON_BOND || i4 < nlocal)
           ev_pre += (flt_t)0.5;
 
         if (EFLAG) {
           flt_t ecoul, evdwl;
           ecoul = tweight * forcecoul;
           evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv -
                                      fc.ljp[itype][jtype].lj4);
           secoul += ev_pre * ecoul;
           sevdwl += ev_pre * evdwl;
           if (eatom) {
             evdwl *= (flt_t)0.5;
             evdwl += (flt_t)0.5 * ecoul;
             if (NEWTON_BOND || i1 < nlocal)
               f[i1].w += evdwl;
             if (NEWTON_BOND || i4 < nlocal)
               f[i4].w += evdwl;
           }
         }
         //            IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
         //                                 delx, dely, delz);
         if (VFLAG && vflag) {
           spv0 += ev_pre * delx * delx * fpair;
           spv1 += ev_pre * dely * dely * fpair;
           spv2 += ev_pre * delz * delz * fpair;
           spv3 += ev_pre * delx * dely * fpair;
           spv4 += ev_pre * delx * delz * fpair;
           spv5 += ev_pre * dely * delz * fpair;
         }
       }
 
       // apply force to each of 4 atoms
       #if defined(LMP_SIMD_COMPILER_TEST)
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
           f[i4].y += f4y;
           f[i4].z += f4z;
         }
       }
     } // for n
     if (EFLAG) {
       oedihedral += sedihedral;
       oecoul += secoul;
       oevdwl += sevdwl;
     }
     if (VFLAG && vflag) {
       ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
       opv0 += spv0; opv1 += spv1; opv2 += spv2;
       opv3 += spv3; opv4 += spv4; opv5 += spv5;
     }
   } // omp parallel
 
   if (EFLAG) {
     energy += oedihedral;
     force->pair->eng_vdwl += oevdwl;
     force->pair->eng_coul += oecoul;
   }
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
     force->pair->virial[0] += opv0;
     force->pair->virial[1] += opv1;
     force->pair->virial[2] += opv2;
     force->pair->virial[3] += opv3;
     force->pair->virial[4] += opv4;
     force->pair->virial[5] += opv5;
   }
 
   fix->set_reduce_flag();
 }
 
 #else
 
 /* ----------------------------------------------------------------------
 
 Vector intrinsics are temporarily being used for the CHARMM dihedral
 style to allow for advanced features in the AVX512 instruction set to
 be exploited on early hardware. We hope to see compiler improvements for
 AVX512 that will eliminate this requirement, so it is not recommended to
 develop code based on the intrinsics implementation. Please e-mail the
 authors for more details.
 
 ------------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralCharmmIntel::eval(const int vflag,
                                IntelBuffers<flt_t,acc_t> *buffers,
                                const ForceConst<flt_t> &fc)
 
 {
   typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
   typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
   const int swidth = SIMD_type<flt_t>::width();
 
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   flt_t * _noalias const q = buffers->get_q(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
   if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)           \
     reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
               opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
     int nfrom, npl, nto, tid;
     IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
                              swidth);
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int * _noalias const dihedrallist =
       (int *) neighbor->dihedrallist[0];
     const flt_t * _noalias const weight = &(fc.weight[0]);
     const flt_t * _noalias const x_f = &(x[0].x);
     const flt_t * _noalias const cos_shift = &(fc.bp[0].cos_shift);
     const flt_t * _noalias const sin_shift = &(fc.bp[0].sin_shift);
     const flt_t * _noalias const k = &(fc.bp[0].k);
     const int * _noalias const multiplicity = &(fc.bp[0].multiplicity);
     const flt_t * _noalias const plj1 = &(fc.ljp[0][0].lj1);
     const flt_t * _noalias const plj2 = &(fc.ljp[0][0].lj2);
     const flt_t * _noalias const plj3 = &(fc.ljp[0][0].lj3);
     const flt_t * _noalias const plj4 = &(fc.ljp[0][0].lj4);
     acc_t * _noalias const pforce= &(f[0].x);
     acc_t * _noalias const featom = &(f[0].w);
     const flt_t qqrd2e = force->qqrd2e;
 
     SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
     if (EFLAG) {
       sevdwl = SIMD_set((acc_t)0.0);
       secoul = SIMD_set((acc_t)0.0);
       sedihedral = SIMD_set((acc_t)0.0);
     }
     if (VFLAG && vflag) {
       sv0 = SIMD_set((acc_t)0.0);
       sv1 = SIMD_set((acc_t)0.0);
       sv2 = SIMD_set((acc_t)0.0);
       sv3 = SIMD_set((acc_t)0.0);
       sv4 = SIMD_set((acc_t)0.0);
       sv5 = SIMD_set((acc_t)0.0);
       spv0 = SIMD_set((acc_t)0.0);
       spv1 = SIMD_set((acc_t)0.0);
       spv2 = SIMD_set((acc_t)0.0);
       spv3 = SIMD_set((acc_t)0.0);
       spv4 = SIMD_set((acc_t)0.0);
       spv5 = SIMD_set((acc_t)0.0);
     }
 
     SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                                  55, 60, 65, 70, 75) + (nfrom * 5);
     const int nto5 = nto * 5;
     const int nlocals4 = nlocal << 4;
     const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
     const int ntypes = atom->ntypes + 1;
 
     for (int n = nfrom; n < nto; n += npl) {
       SIMD_mask nmask = n_offset < nto5;
       SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset);
       const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1);
       i1 = i1 << 4;
       const SIMD_int i2 = SIMD_gather(nmask, dihedrallist+1, n_offset) << 4;
       const SIMD_int i3 = SIMD_gather(nmask, dihedrallist+2, n_offset) << 4;
       SIMD_int i4 = SIMD_gather(nmask, dihedrallist+3, n_offset);
       const SIMD_flt_t q4 = SIMD_gather(nmask, q, i4);
       i4 = i4 << 4;
       SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset);
       const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type);
       type = type << 2;
       n_offset = n_offset + npl * 5;
 
       // 1st bond
 
       SIMD_flt_t x1, x2, y1, y2, z1, z2;
       SIMD_int itype;
 
       SIMD_atom_gather(nmask, x_f, i1, x1, y1, z1, itype);
       SIMD_atom_gather(nmask, x_f, i2, x2, y2, z2);
 
       const SIMD_flt_t vb1x = x1 - x2;
       const SIMD_flt_t vb1y = y1 - y2;
       const SIMD_flt_t vb1z = z1 - z2;
 
       // 2nd bond
 
       SIMD_flt_t x3, y3, z3;
 
       SIMD_atom_gather(nmask, x_f, i3, x3, y3, z3);
 
       const SIMD_flt_t vb2xm = x2 - x3;
       const SIMD_flt_t vb2ym = y2 - y3;
       const SIMD_flt_t vb2zm = z2 - z3;
 
       // 3rd bond
 
       SIMD_flt_t x4, y4, z4;
       SIMD_int jtype;
 
       SIMD_atom_gather(nmask, x_f, i4, x4, y4, z4, jtype);
 
       const SIMD_flt_t vb3x = x4 - x3;
       const SIMD_flt_t vb3y = y4 - y3;
       const SIMD_flt_t vb3z = z4 - z3;
 
       // 1-4
 
       const SIMD_flt_t delx = x1 - x4;
       const SIMD_flt_t dely = y1 - y4;
       const SIMD_flt_t delz = z1 - z4;
 
       // c,s calculation
 
       const SIMD_flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
       const SIMD_flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
       const SIMD_flt_t az = vb1x*vb2ym - vb1y*vb2xm;
       const SIMD_flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
       const SIMD_flt_t by = vb3z*vb2xm - vb3x*vb2zm;
       const SIMD_flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
 
       const SIMD_flt_t rasq = ax*ax + ay*ay + az*az;
       const SIMD_flt_t rbsq = bx*bx + by*by + bz*bz;
       const SIMD_flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const SIMD_flt_t rg = SIMD_sqrt(rgsq);
 
       const SIMD_flt_t szero = SIMD_set((flt_t)0.0);
       const SIMD_flt_t rginv = SIMD_rcpz(rg > szero, rg);
       const SIMD_flt_t ra2inv = SIMD_rcpz(rasq > szero, rasq);
       const SIMD_flt_t rb2inv = SIMD_rcpz(rbsq > szero, rbsq);
       const SIMD_flt_t rabinv = SIMD_sqrt(ra2inv*rb2inv);
 
       SIMD_flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const SIMD_flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
 
       // error check
       const SIMD_flt_t one = SIMD_set((flt_t)1.0);
       const SIMD_flt_t mone = SIMD_set((flt_t)-1.0);
 
       const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
       const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
       if (c > ptol || c < ntol)
         if (screen)
           error->warning(FLERR,"Dihedral problem.");
 
       c = SIMD_set(c, c > one, one);
       c = SIMD_set(c, c < mone, mone);
 
       const SIMD_flt_t tcos_shift = SIMD_gather(nmask, cos_shift, type);
       const SIMD_flt_t tsin_shift = SIMD_gather(nmask, sin_shift, type);
       const SIMD_flt_t tk = SIMD_gather(nmask, k, type);
       const SIMD_int m = SIMD_gatherz_offset<flt_t>(nmask, multiplicity, type);
 
       SIMD_flt_t p(one);
       SIMD_flt_t ddf1(szero);
       SIMD_flt_t df1(szero);
 
       const int m_max = SIMD_max(m);
 
       for (int i = 0; i < m_max; i++) {
         const SIMD_mask my_m = i < m;
         ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
         df1 = SIMD_set(df1, my_m, p*s + df1*c);
         p = SIMD_set(p, my_m, ddf1);
       }
 
       SIMD_flt_t multf;
       SIMD_cast(-m,multf);
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 = df1 * multf;
       p = p + one;
 
       SIMD_mask mzero = (m == SIMD_set((int)0));
       p = SIMD_set(p, mzero, one + tcos_shift);
       df1 = SIMD_set(df1, mzero, szero);
 
       const SIMD_flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
       const SIMD_flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
       const SIMD_flt_t fga = fg*ra2inv*rginv;
       const SIMD_flt_t hgb = hg*rb2inv*rginv;
       const SIMD_flt_t gaa = -ra2inv*rg;
       const SIMD_flt_t gbb = rb2inv*rg;
 
       const SIMD_flt_t dtfx = gaa*ax;
       const SIMD_flt_t dtfy = gaa*ay;
       const SIMD_flt_t dtfz = gaa*az;
       const SIMD_flt_t dtgx = fga*ax - hgb*bx;
       const SIMD_flt_t dtgy = fga*ay - hgb*by;
       const SIMD_flt_t dtgz = fga*az - hgb*bz;
       const SIMD_flt_t dthx = gbb*bx;
       const SIMD_flt_t dthy = gbb*by;
       const SIMD_flt_t dthz = gbb*bz;
 
       const SIMD_flt_t df = -tk * df1;
 
       const SIMD_flt_t sx2 = df*dtgx;
       const SIMD_flt_t sy2 = df*dtgy;
       const SIMD_flt_t sz2 = df*dtgz;
 
       SIMD_flt_t f1x = df*dtfx;
       SIMD_flt_t f1y = df*dtfy;
       SIMD_flt_t f1z = df*dtfz;
 
       SIMD_flt_t f2x = sx2 - f1x;
       SIMD_flt_t f2y = sy2 - f1y;
       SIMD_flt_t f2z = sz2 - f1z;
 
       SIMD_flt_t f4x = df*dthx;
       SIMD_flt_t f4y = df*dthy;
       SIMD_flt_t f4z = df*dthz;
 
       SIMD_flt_t f3x = -sx2 - f4x;
       SIMD_flt_t f3y = -sy2 - f4y;
       SIMD_flt_t f3z = -sz2 - f4z;
 
       SIMD_flt_t qdeng;
       if (EFLAG || VFLAG) {
         SIMD_flt_t ev_pre;
         if (NEWTON_BOND) ev_pre = one;
         else {
           ev_pre = szero;
           const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
           ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
           ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
           ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
           ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
         }
         SIMD_zero_masked(nmask, ev_pre);
         if (EFLAG) {
           const SIMD_flt_t deng = tk * p;
           sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
           if (eatom) {
             qdeng = deng * SIMD_set((flt_t)0.25);
             SIMD_mask newton_mask;
             if (NEWTON_BOND) newton_mask = nmask;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
             SIMD_flt_t ieng = qdeng;
             SIMD_jeng_update(newton_mask, featom, i2, ieng);
             ieng = qdeng;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
             SIMD_jeng_update(newton_mask, featom, i3, ieng);
           }
         }
         if (VFLAG && vflag) {
           sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
           sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
           sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
           sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
           sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
           sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
         }
       }
 
       SIMD_mask newton_mask;
       if (NEWTON_BOND) newton_mask = nmask;
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i2, f2x, f2y, f2z);
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i3, f3x, f3y, f3z);
 
       // 1-4 LJ and Coulomb interactions
       // tally energy/virial in pair, using newton_bond as newton flag
 
       const SIMD_flt_t rsq = delx*delx + dely*dely + delz*delz;
       const SIMD_flt_t r2inv = SIMD_rcpz(nmask, rsq);
       const SIMD_flt_t r6inv = r2inv*r2inv*r2inv;
 
       const SIMD_flt_t simd_qqrd2e = SIMD_set(qqrd2e);
       SIMD_flt_t forcecoul;
       if (implicit) forcecoul = simd_qqrd2e * q1 * q4 * r2inv;
       else forcecoul = simd_qqrd2e * q1 * q4 * SIMD_sqrt(r2inv);
 
       const SIMD_int ijtype = (itype * ntypes + jtype) << 2;
       const SIMD_flt_t lj1 = SIMD_gather(nmask, plj1, ijtype);
       const SIMD_flt_t lj2 = SIMD_gather(nmask, plj2, ijtype);
       const SIMD_flt_t forcelj = r6inv * (lj1 * r6inv - lj2);
       const SIMD_flt_t fpair = tweight * (forcelj + forcecoul) * r2inv;
 
       f1x = f1x + delx * fpair;
       f1y = f1y + dely * fpair;
       f1z = f1z + delz * fpair;
       f4x = f4x - delx * fpair;
       f4y = f4y - dely * fpair;
       f4z = f4z - delz * fpair;
 
       if (EFLAG || VFLAG) {
         SIMD_flt_t ev_pre;
         if (NEWTON_BOND) ev_pre = one;
         else {
           ev_pre = szero;
           const SIMD_flt_t half = SIMD_set((flt_t)0.5);
           ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
           ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
         }
         SIMD_zero_masked(nmask, ev_pre);
 
         if (EFLAG) {
           const SIMD_flt_t ecoul = tweight * forcecoul;
           const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
           const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
           SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
           secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
           sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
           if (eatom) {
             const SIMD_flt_t half = SIMD_set((flt_t)0.5);
             evdwl = evdwl * half;
             evdwl = evdwl + half * ecoul + qdeng;
 
             if (NEWTON_BOND) newton_mask = nmask;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
             SIMD_flt_t ieng = evdwl;
             SIMD_jeng_update(newton_mask, featom, i1, ieng);
             ieng = evdwl;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
             SIMD_jeng_update(newton_mask, featom, i4, ieng);
           }
         }
         if (VFLAG && vflag) {
           spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
           spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
           spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
           spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
           spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
           spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
         }
       }
 
       if (NEWTON_BOND) newton_mask = nmask;
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i1, f1x, f1y, f1z);
       if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
       SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z);
     } // for n
 
     if (EFLAG) {
       oedihedral += SIMD_sum(sedihedral);
       oecoul += SIMD_sum(secoul);
       oevdwl += SIMD_sum(sevdwl);
     }
     if (VFLAG && vflag) {
       ov0 += SIMD_sum(sv0);
       ov1 += SIMD_sum(sv1);
       ov2 += SIMD_sum(sv2);
       ov3 += SIMD_sum(sv3);
       ov4 += SIMD_sum(sv4);
       ov5 += SIMD_sum(sv5);
       opv0 += SIMD_sum(spv0);
       opv1 += SIMD_sum(spv1);
       opv2 += SIMD_sum(spv2);
       opv3 += SIMD_sum(spv3);
       opv4 += SIMD_sum(spv4);
       opv5 += SIMD_sum(spv5);
     }
   } // omp parallel
 
   if (EFLAG) {
     energy += oedihedral;
     force->pair->eng_vdwl += oevdwl;
     force->pair->eng_coul += oecoul;
   }
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
     force->pair->virial[0] += opv0;
     force->pair->virial[1] += opv1;
     force->pair->virial[2] += opv2;
     force->pair->virial[3] += opv3;
     force->pair->virial[4] += opv4;
     force->pair->virial[5] += opv5;
   }
 
   fix->set_reduce_flag();
 }
 
 #endif
 
 /* ---------------------------------------------------------------------- */
 
 // Style initialisation: runs the base-class init, locates the FixIntel
 // instance and packs the per-type coefficients for the precision mode that
 // fix reports.
 void DihedralCharmmIntel::init_style()
 {
   DihedralCharmm::init_style();
 
   // The fix is looked up under the id "package_intel"; without it this
   // style cannot run, so abort with an error.
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   // Offload builds only: a non-zero offload balance sets _use_base and
   // returns early, so none of the packing below is performed.
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   // Mixed and single precision both fill force_const_single (they differ
   // only in the buffers object passed); double fills force_const_double.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy this style's per-type coefficients into `fc`, converting them to
 // the element type of ForceConst<flt_t> on assignment.
 //
 // fc      - destination tables; resized here through fc.set_ntypes()
 // buffers - only used to pass on the atom-type count via set_ntypes()
 template <class flt_t, class acc_t>
 void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
                                            IntelBuffers<flt_t,acc_t> *buffers)
 {
 
   // Table extents are one larger than the atom-type / dihedral-type counts.
   const int tp1 = atom->ntypes + 1;
   const int bp1 = atom->ndihedraltypes + 1;
   fc.set_ntypes(tp1,bp1,memory);
   buffers->set_ntypes(tp1);
 
   // The lj14_* pair coefficients are copied only when weightflag is set.
   if (weightflag) {
-    for (int i = 0; i < tp1; i++) {
-      for (int j = 0; j < tp1; j++) {
+    for (int i = 1; i < tp1; i++) {
+      for (int j = 1; j < tp1; j++) {
         fc.ljp[i][j].lj1 = lj14_1[i][j];
         fc.ljp[i][j].lj2 = lj14_2[i][j];
         fc.ljp[i][j].lj3 = lj14_3[i][j];
         fc.ljp[i][j].lj4 = lj14_4[i][j];
       }
     }
   }
 
   // Per-dihedral-type parameters plus the weight for each type.
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.bp[i].multiplicity = multiplicity[i];
     fc.bp[i].cos_shift = cos_shift[i];
     fc.bp[i].sin_shift = sin_shift[i];
     fc.bp[i].k = k[i];
     fc.weight[i] = weight[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)size the coefficient tables held by this ForceConst.
 //
 // npairtypes - extent of each dimension of the square `ljp` table
 // nbondtypes - length of the `bp` and `weight` arrays
 // memory     - allocator used for any newly created table; it is stored in
 //              `_memory` so a later call can free what was created here
 //
 // A table is only touched when the requested size differs from the current
 // one; a requested size of 0 frees the old storage without reallocating.
 template <class flt_t>
 void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
                                                         const int nbondtypes,
                                                         Memory *memory) {
   if (npairtypes != _npairtypes) {
     // Old storage is released through the allocator recorded by the
     // previous call.
     if (_npairtypes > 0)
       _memory->destroy(ljp);
     if (npairtypes > 0)
       memory->create(ljp,npairtypes,npairtypes,"fc.ljp");
   }
 
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0) {
       _memory->destroy(bp);
       _memory->destroy(weight);
     }
 
     // Allocate through the caller-supplied `memory`, exactly as the ljp
     // branch does. `_memory` is only assigned at the end of this function,
     // so it may not yet point at a valid allocator when the tables are
     // created for the first time.
     if (nbondtypes > 0) {
       memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
       memory->create(weight,nbondtypes,"dihedralcharmmintel.weight");
     }
   }
   _npairtypes = npairtypes;
   _nbondtypes = nbondtypes;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp
index 196b024fa..5d16b0da7 100644
--- a/src/USER-INTEL/dihedral_harmonic_intel.cpp
+++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp
@@ -1,425 +1,425 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
 #include <math.h>
 #include "dihedral_harmonic_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "pair.h"
 #include "update.h"
 #include "error.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // Bounds on the computed dihedral cosine: eval() prints a "Dihedral problem"
 // warning when the value lies outside [MTOLERANCE, PTOLERANCE]. Both macros
 // expand to a cast to flt_t, so they need flt_t in scope where they are used.
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // Five-int record that eval() uses to read neighbor->dihedrallist entries:
 // a..d are the four atom indices, t is the dihedral type.
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
 // Construct on top of DihedralHarmonic and add the INTEL bit to suffix_flag.
 DihedralHarmonicIntel::DihedralHarmonicIntel(class LAMMPS *lmp)
   : DihedralHarmonic(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Public entry point: forwards to the templated compute() instantiated for
 // the precision mode reported by the FixIntel instance.
 void DihedralHarmonicIntel::compute(int eflag, int vflag)
 {
   // Offload builds: when init_style() set _use_base, defer entirely to the
   // base-class implementation.
   #ifdef _LMP_INTEL_OFFLOAD
   if (_use_base) {
     DihedralHarmonic::compute(eflag, vflag);
     return;
   }
   #endif
 
   // <flt_t, acc_t> pairs: mixed = <float,double>, double = <double,double>,
   // anything else = <float,float>.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Select the eval<EFLAG,VFLAG,NEWTON_BOND> instantiation that matches the
 // runtime energy/virial request and the newton_bond setting.
 //
 // eflag, vflag - energy / virial request flags for this step
 // buffers      - precision-specific buffers passed through to eval()
 // fc           - packed per-type coefficients passed through to eval()
 template <class flt_t, class acc_t>
 void DihedralHarmonicIntel::compute(int eflag, int vflag,
                                   IntelBuffers<flt_t,acc_t> *buffers,
                                   const ForceConst<flt_t> &fc)
 {
   // ev_setup() is defined elsewhere; presumably it sets evflag -- TODO confirm.
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
   } else evflag = 0;
 
   if (evflag) {
     if (vflag && !eflag) {
       // Virial only.
       if (force->newton_bond)
         eval<0,1,1>(vflag, buffers, fc);
       else
         eval<0,1,0>(vflag, buffers, fc);
     } else {
       // Energy requested: the virial code is compiled in as well and is
       // gated inside eval() by the runtime vflag.
       if (force->newton_bond)
         eval<1,1,1>(vflag, buffers, fc);
       else
         eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     // Forces only.
     if (force->newton_bond)
       eval<0,0,1>(vflag, buffers, fc);
     else
       eval<0,0,0>(vflag, buffers, fc);
   }
 }
 
 // Compute the dihedral forces, and optionally energy/virial sums, for every
 // entry of neighbor->dihedrallist.
 //
 // Template flags:
 //   EFLAG / VFLAG - compile in the energy / virial accumulation paths
 //   NEWTON_BOND   - when 0, forces are added only to atoms with index < nlocal
 // Arguments:
 //   vflag   - runtime flag that, together with VFLAG, gates the virial sums
 //   buffers - provides the coordinate array (get_x) and force storage
 //   fc      - per-type parameters read as fc.bp[type]
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralHarmonicIntel::eval(const int vflag,
                                IntelBuffers<flt_t,acc_t> *buffers,
                                const ForceConst<flt_t> &fc)
 
 {
   // Nothing to do for an empty list (note: returns before set_reduce_flag()).
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // Offset between consecutive threads' force arrays (see `f` below); it is
   // derived from nall with NEWTON_BOND and from nlocal without it.
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   // IP_PRE_get_buffers is a macro defined elsewhere; presumably it fills
   // tc, f_start and ev_global -- TODO confirm.
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   // Global energy / virial accumulators (OpenMP reduction variables).
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oedihedral = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)           \
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     // The IP_PRE_omp_* macros are defined elsewhere; presumably they set this
     // thread's id and its [nfrom, nto) range (plus the stride npl in the
     // non-SIMDOFF variant) -- TODO confirm.
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     // Each thread accumulates into its own force array, cleared first when
     // the fix asks for it.
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     // View the dihedral list as records of five ints (a, b, c, d, t).
     const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];
 
     // SIMDOFF variant: loop-local accumulators used as `#pragma simd`
     // reduction variables; they are added to the o* totals after the loop.
     #ifdef LMP_INTEL_USE_SIMDOFF
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) sedihedral = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n ++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
       const int i4 = dihedrallist[n].d;
       const int type = dihedrallist[n].t;
 
       // 1st bond
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
 
       // 2nd bond
 
       const flt_t vb2xm = x[i2].x - x[i3].x;
       const flt_t vb2ym = x[i2].y - x[i3].y;
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
 
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
 
       // c,s calculation
 
       // a = vb1 x vb2m and b = vb3 x vb2m (cross products).
       const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
       const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
       const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
       const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
       const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
       const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
 
       const flt_t rasq = ax*ax + ay*ay + az*az;
       const flt_t rbsq = bx*bx + by*by + bz*bz;
       const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const flt_t rg = sqrt(rgsq);
 
       // Inverses stay 0 for a zero-length vector, avoiding division by zero.
       flt_t rginv, ra2inv, rb2inv;
       rginv = ra2inv = rb2inv = (flt_t)0.0;
       if (rg > 0) rginv = (flt_t)1.0/rg;
       if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
       if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
       const flt_t rabinv = sqrt(ra2inv*rb2inv);
 
       // c = (a . b) / (|a| |b|);  s = |vb2m| (a . vb3) / (|a| |b|)
       flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
 
       // error check
       // Compiled only without LMP_INTEL_USE_SIMDOFF: warn and dump the four
       // atom positions when c is outside [MTOLERANCE, PTOLERANCE].
       // NOTE(review): str is a fixed 128-byte buffer filled with sprintf --
       // confirm the widest BIGINT/TAGINT formats cannot overflow it.
       #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me = comm->me;
 
         if (screen) {
           char str[128];
           sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,tid,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
           error->warning(FLERR,str,0);
           fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
           fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
           fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
           fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
       #endif
 
       // Clamp c to [-1, 1].
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t tcos_shift = fc.bp[type].cos_shift;
       const flt_t tsin_shift = fc.bp[type].sin_shift;
       const flt_t tk = fc.bp[type].k;
       const int m = fc.bp[type].multiplicity;
 
       flt_t p = (flt_t)1.0;
       flt_t ddf1, df1;
       ddf1 = df1 = (flt_t)0.0;
 
       // Angle-addition recurrence: treating c and s as the cosine and sine
       // of the dihedral angle phi, m iterations leave cos(m*phi) in p (and
       // ddf1) and sin(m*phi) in df1.
       for (int i = 0; i < m; i++) {
         ddf1 = p*c - df1*s;
         df1 = p*s + df1*c;
         p = ddf1;
       }
 
       // Apply the per-type shift, scale df1 by -m and add 1 to p.
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 *= -m;
       p += (flt_t)1.0;
 
       // Multiplicity 0: fixed energy term and no force contribution.
       if (m == 0) {
         p = (flt_t)1.0 + tcos_shift;
         df1 = (flt_t)0.0;
       }
 
       const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
       const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
       const flt_t fga = fg*ra2inv*rginv;
       const flt_t hgb = hg*rb2inv*rginv;
       const flt_t gaa = -ra2inv*rg;
       const flt_t gbb = rb2inv*rg;
 
       const flt_t dtfx = gaa*ax;
       const flt_t dtfy = gaa*ay;
       const flt_t dtfz = gaa*az;
       const flt_t dtgx = fga*ax - hgb*bx;
       const flt_t dtgy = fga*ay - hgb*by;
       const flt_t dtgz = fga*az - hgb*bz;
       const flt_t dthx = gbb*bx;
       const flt_t dthy = gbb*by;
       const flt_t dthz = gbb*bz;
 
       const flt_t df = -tk * df1;
 
       const flt_t sx2 = df*dtgx;
       const flt_t sy2 = df*dtgy;
       const flt_t sz2 = df*dtgz;
 
       // By construction f1 + f2 + f3 + f4 = 0 in every component.
       flt_t f1x = df*dtfx;
       flt_t f1y = df*dtfy;
       flt_t f1z = df*dtfz;
 
       const flt_t f2x = sx2 - f1x;
       const flt_t f2y = sy2 - f1y;
       const flt_t f2z = sz2 - f1z;
 
       flt_t f4x = df*dthx;
       flt_t f4y = df*dthy;
       flt_t f4z = df*dthz;
 
       const flt_t f3x = -sx2 - f4x;
       const flt_t f3y = -sy2 - f4y;
       const flt_t f3z = -sz2 - f4z;
 
       // Energy/virial tally through the IP_PRE_ev_tally_dihed macro (defined
       // elsewhere). The SIMDOFF variant passes the loop-local s* accumulators,
       // the other variant passes the o* reduction variables directly.
       if (EFLAG || VFLAG) {
         flt_t deng;
         if (EFLAG) deng = tk * p;
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
                               f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
                               vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
                               vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
                               sv0, sv1, sv2, sv3, sv4, sv5);
         #else
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
                               f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
                               vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
                               vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
                               ov0, ov1, ov2, ov3, ov4, ov5);
         #endif
       }
 
       // Add the four forces to this thread's force array; without
       // NEWTON_BOND only atoms with index < nlocal are updated.
       #ifdef LMP_INTEL_USE_SIMDOFF
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
           f[i2].y += f2y;
           f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
           f[i4].y += f4y;
           f[i4].z += f4z;
         }
       }
     } // for n
     // SIMDOFF variant: add the loop-local sums to the reduction variables.
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oedihedral += sedihedral;
     if (VFLAG && vflag) {
         ov0 += sv0; ov1 += sv1; ov2 += sv2;
         ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
 
   // Add the reduced totals to this style's energy and virial.
   if (EFLAG) energy += oedihedral;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Style initialisation: runs the base-class init, locates the FixIntel
 // instance and packs the per-type coefficients for the precision mode that
 // fix reports.
 void DihedralHarmonicIntel::init_style()
 {
   DihedralHarmonic::init_style();
 
   // The fix is looked up under the id "package_intel"; without it this
   // style cannot run, so abort with an error.
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   // Offload builds only: a non-zero offload balance sets _use_base (which
   // makes compute() defer to DihedralHarmonic::compute) and returns before
   // any coefficients are packed.
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   // Mixed and single precision both fill force_const_single (they differ
   // only in the buffers object passed); double fills force_const_double.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy the per-dihedral-type parameters into `fc`, converting them to the
 // element type of ForceConst<flt_t> on assignment.
 //
 // fc      - destination table; resized here through fc.set_ntypes()
 // buffers - not used in this function body
 template <class flt_t, class acc_t>
 void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
                                              IntelBuffers<flt_t,acc_t> *buffers)
 {
   // The table extent is one larger than the number of dihedral types.
   const int bp1 = atom->ndihedraltypes + 1;
   fc.set_ntypes(bp1,memory);
 
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.bp[i].multiplicity = multiplicity[i];
     fc.bp[i].cos_shift = cos_shift[i];
     fc.bp[i].sin_shift = sin_shift[i];
     fc.bp[i].k = k[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)size the `bp` parameter array held by this ForceConst.
 //
 // nbondtypes - requested length of `bp`; 0 frees the old array without
 //              allocating a new one
 // memory     - allocator used for a newly created array; it is stored in
 //              `_memory` so a later call can free what was created here
 //
 // The array is only touched when the requested length differs from the
 // current one.
 template <class flt_t>
 void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
                                                           Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     // Old storage is released through the allocator recorded by the
     // previous call.
     if (_nbondtypes > 0)
       _memory->destroy(bp);
 
     // Allocate through the caller-supplied `memory`: `_memory` is only
     // assigned at the end of this function, so it may not yet point at a
     // valid allocator when the array is created for the first time.
     // NOTE(review): the allocation label still reads "dihedralcharmmintel.bp"
     // in this harmonic style; left unchanged here.
     if (nbondtypes > 0)
       memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
   }
   _nbondtypes = nbondtypes;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp
index 1abeba1d5..e290ab906 100644
--- a/src/USER-INTEL/dihedral_opls_intel.cpp
+++ b/src/USER-INTEL/dihedral_opls_intel.cpp
@@ -1,452 +1,452 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
 #include <math.h>
 #include "dihedral_opls_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "memory.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "pair.h"
 #include "update.h"
 #include "error.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // Bounds on the computed dihedral cosine: eval() prints a "Dihedral problem"
 // warning when the value lies outside [MTOLERANCE, PTOLERANCE]. All macros
 // here expand to a cast to flt_t, so they need flt_t in scope where used.
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // eval() substitutes INVSMALL for 1/sqrt(sin2) when sin2 < SMALL2.
 #define SMALL2     (flt_t)0.000001
 #define INVSMALL   (flt_t)1000.0
 // Presumably the analogous threshold/replacement pair for a second guard;
 // their use is outside this view -- TODO confirm.
 #define SMALLER2   (flt_t)0.0000000001
 #define INVSMALLER (flt_t)100000.0
 // Five-int record that eval() uses to read neighbor->dihedrallist entries:
 // a..d are the four atom indices, t is the dihedral type.
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
 // Construct on top of DihedralOPLS and add the INTEL bit to suffix_flag.
 DihedralOPLSIntel::DihedralOPLSIntel(class LAMMPS *lmp)
   : DihedralOPLS(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Public entry point: forwards to the templated compute() instantiated for
 // the precision mode reported by the FixIntel instance.
 void DihedralOPLSIntel::compute(int eflag, int vflag)
 {
   // Offload builds: when _use_base is set, defer entirely to the base-class
   // implementation.
   #ifdef _LMP_INTEL_OFFLOAD
   if (_use_base) {
     DihedralOPLS::compute(eflag, vflag);
     return;
   }
   #endif
 
   // <flt_t, acc_t> pairs: mixed = <float,double>, double = <double,double>,
   // anything else = <float,float>.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Select the eval<EFLAG,VFLAG,NEWTON_BOND> instantiation that matches the
 // runtime energy/virial request and the newton_bond setting.
 //
 // eflag, vflag - energy / virial request flags for this step
 // buffers      - precision-specific buffers passed through to eval()
 // fc           - packed per-type coefficients passed through to eval()
 template <class flt_t, class acc_t>
 void DihedralOPLSIntel::compute(int eflag, int vflag,
                                   IntelBuffers<flt_t,acc_t> *buffers,
                                   const ForceConst<flt_t> &fc)
 {
   // ev_setup() is defined elsewhere; presumably it sets evflag -- TODO confirm.
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
   } else evflag = 0;
 
   if (evflag) {
     if (vflag && !eflag) {
       // Virial only.
       if (force->newton_bond)
         eval<0,1,1>(vflag, buffers, fc);
       else
         eval<0,1,0>(vflag, buffers, fc);
     } else {
       // Energy requested: the virial code is compiled in as well and
       // receives the runtime vflag.
       if (force->newton_bond)
         eval<1,1,1>(vflag, buffers, fc);
       else
         eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     // Forces only.
     if (force->newton_bond)
       eval<0,0,1>(vflag, buffers, fc);
     else
       eval<0,0,0>(vflag, buffers, fc);
   }
 }
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralOPLSIntel::eval(const int vflag,
                                IntelBuffers<flt_t,acc_t> *buffers,
                                const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oedihedral = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)           \
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) sedihedral = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n ++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
       const int i4 = dihedrallist[n].d;
       const int type = dihedrallist[n].t;
 
       // 1st bond
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
 
       // 2nd bond
 
       const flt_t vb2xm = x[i2].x - x[i3].x;
       const flt_t vb2ym = x[i2].y - x[i3].y;
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
 
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
 
       // 1-4
 
       const flt_t delx = x[i1].x - x[i4].x;
       const flt_t dely = x[i1].y - x[i4].y;
       const flt_t delz = x[i1].z - x[i4].z;
 
 
       // c0 calculation
       // 1st and 2nd angle
 
       const flt_t b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
       const flt_t rb1 = (flt_t)1.0 / sqrt(b1mag2);
       const flt_t sb1 = (flt_t)1.0 / b1mag2;
 
       const flt_t b2mag2 = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const flt_t rb2 = (flt_t)1.0 / sqrt(b2mag2);
       const flt_t sb2 = (flt_t)1.0 / b2mag2;
 
       const flt_t b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
       const flt_t rb3 = (flt_t)1.0 / sqrt(b3mag2);
       const flt_t sb3 = (flt_t)1.0 / b3mag2;
 
       const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
 
       flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm;
       const flt_t r12c1 =  rb1 * rb2;
       const flt_t c1mag = ctmp * r12c1;
 
       ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
       const flt_t r12c2 =  rb2 * rb3;
       const flt_t c2mag = ctmp * r12c2;
 
       // cos and sin of 2 angles and final c
 
       flt_t sin2 = MAX((flt_t)1.0 - c1mag*c1mag,(flt_t)0.0);
       flt_t sc1 = (flt_t)1.0/sqrt(sin2);
       if (sin2 < SMALL2) sc1 = INVSMALL;
 
       sin2 = MAX((flt_t)1.0 - c2mag*c2mag,(flt_t)0.0);
       flt_t sc2 = (flt_t)1.0/sqrt(sin2);
       if (sin2 < SMALL2) sc2 = INVSMALL;
 
       const flt_t s1 = sc1 * sc1;
       const flt_t s2 = sc2 * sc2;
       flt_t s12 = sc1 * sc2;
       flt_t c = (c0 + c1mag*c2mag) * s12;
 
       const flt_t cx = vb1z*vb2ym - vb1y*vb2zm;
       const flt_t cy = vb1x*vb2zm - vb1z*vb2xm;
       const flt_t cz = vb1y*vb2xm - vb1x*vb2ym;
       const flt_t cmag = (flt_t)1.0/sqrt(cx*cx + cy*cy + cz*cz);
       const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3;
 
       // error check
       #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me = comm->me;
 
         if (screen) {
           char str[128];
           sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,tid,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
           error->warning(FLERR,str,0);
           fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
           fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
           fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
           fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
       #endif
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       // force & energy
       // p = sum (i=1,4) k_i * (1 + (-1)**(i+1)*cos(i*phi) )
       // pd = dp/dc
 
       const flt_t cossq = c * c;
       const flt_t sinsq = (flt_t)1.0 - cossq;
       flt_t siinv = (flt_t)1.0/sqrt(sinsq);
       if (sinsq < SMALLER2 ) siinv = INVSMALLER;
       if (dx < (flt_t)0.0) siinv = -siinv;
 
       const flt_t cos_2phi = cossq - sinsq;
       const flt_t sin_2phim = (flt_t)2.0 * c;
       const flt_t cos_3phi = (flt_t)2.0 * c * cos_2phi - c;
       const flt_t sin_3phim = (flt_t)2.0 * cos_2phi + (flt_t)1.0;
       const flt_t cos_4phi = (flt_t)2.0 * cos_2phi * cos_2phi - (flt_t)1.0;
       const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim;
 
       flt_t p, pd;
       p = fc.bp[type].k1*((flt_t)1.0 + c) +
           fc.bp[type].k2*((flt_t)1.0 - cos_2phi) +
           fc.bp[type].k3*((flt_t)1.0 + cos_3phi) +
           fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ;
       pd = fc.bp[type].k1 -
            (flt_t)2.0 * fc.bp[type].k2 * sin_2phim +
            (flt_t)3.0 * fc.bp[type].k3 * sin_3phim -
            (flt_t)4.0 * fc.bp[type].k4 * sin_4phim;
 
       flt_t edihed;
       if (EFLAG) edihed = p;
 
       const flt_t a = pd;
       c = c * a;
       s12 = s12 * a;
       const flt_t a11 = c*sb1*s1;
       const flt_t a22 = -sb2 * ((flt_t)2.0*c0*s12 - c*(s1+s2));
       const flt_t a33 = c*sb3*s2;
       const flt_t a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12);
       const flt_t a13 = -rb1*rb3*s12;
       const flt_t a23 = r12c2 * (c2mag*c*s2 + c1mag*s12);
 
       const flt_t sx2  = a12*vb1x - a22*vb2xm + a23*vb3x;
       const flt_t sy2  = a12*vb1y - a22*vb2ym + a23*vb3y;
       const flt_t sz2  = a12*vb1z - a22*vb2zm + a23*vb3z;
 
       const flt_t f1x = a11*vb1x - a12*vb2xm + a13*vb3x;
       const flt_t f1y = a11*vb1y - a12*vb2ym + a13*vb3y;
       const flt_t f1z = a11*vb1z - a12*vb2zm + a13*vb3z;
 
       const flt_t f2x = -sx2 - f1x;
       const flt_t f2y = -sy2 - f1y;
       const flt_t f2z = -sz2 - f1z;
 
       const flt_t f4x = a13*vb1x - a23*vb2xm + a33*vb3x;
       const flt_t f4y = a13*vb1y - a23*vb2ym + a33*vb3y;
       const flt_t f4z = a13*vb1z - a23*vb2zm + a33*vb3z;
 
       const flt_t f3x = sx2 - f4x;
       const flt_t f3y = sy2 - f4y;
       const flt_t f3z = sz2 - f4z;
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
                               i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
                               vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
                               vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
                               sv0, sv1, sv2, sv3, sv4, sv5);
         #else
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
                               i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
                               vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
                               vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
                               ov0, ov1, ov2, ov3, ov4, ov5);
         #endif
       }
 
       #ifdef LMP_INTEL_USE_SIMDOFF
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
           f[i2].y += f2y;
           f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
           f[i4].y += f4y;
           f[i4].z += f4z;
         }
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oedihedral += sedihedral;
     if (VFLAG && vflag) {
         ov0 += sv0; ov1 += sv1; ov2 += sv2;
         ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
 
   if (EFLAG) energy += oedihedral;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void DihedralOPLSIntel::init_style()
 {
   DihedralOPLS::init_style();
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy the per-type coefficients k1..k4 into the precision-specific table
 // fc.bp, which set_ntypes() sizes to atom->ndihedraltypes + 1 entries.
 // The buffers argument is not referenced in this body.
 template <class flt_t, class acc_t>
 void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
                                              IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->ndihedraltypes + 1;
   fc.set_ntypes(bp1,memory);
 
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.bp[i].k1 = k1[i];
     fc.bp[i].k2 = k2[i];
     fc.bp[i].k3 = k3[i];
     fc.bp[i].k4 = k4[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the coefficient table bp to hold nbondtypes entries.
 //
 //   nbondtypes  requested number of entries; 0 releases the table
 //   memory      allocator to use; when null the previously stored
 //               allocator is kept
 //
 // The allocator is recorded BEFORE it is used. Previously _memory was only
 // assigned at the end of the function, so the first allocation went through
 // whatever value _memory held before this call.
 // NOTE(review): the allocation label names the charmm style - looks
 // copy-pasted; left unchanged here.
 template <class flt_t>
 void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
                                                           Memory *memory) {
   if (memory) _memory = memory;
 
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(bp);
 
     if (nbondtypes > 0)
       _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
   }
   _nbondtypes = nbondtypes;
 }
diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp
index dc9765d91..39090e5a7 100644
--- a/src/USER-INTEL/improper_cvff_intel.cpp
+++ b/src/USER-INTEL/improper_cvff_intel.cpp
@@ -1,457 +1,457 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
 #include <math.h>
 #include <stdlib.h>
 #include "improper_cvff_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "update.h"
 #include "math_const.h"
 #include "memory.h"
 #include "modify.h"
 #include "suffix.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // flt_t in these macros resolves at the point of use (a template parameter
 // of eval()).
 // Bounds on the computed cosine c outside which eval() prints an
 // "Improper problem" warning.
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // Small-value threshold and the replacement reciprocal used in eval() when
 // guarding 1/sqrt(...); INVSMALL equals 1/sqrt(SMALL2).
 #define SMALL2     (flt_t)0.000001
 #define INVSMALL   (flt_t)1000.0
 // Record layout used to read neighbor->improperlist: four atom indices
 // (a,b,c,d) followed by the improper type (t).
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
 // Construct on top of ImproperCvff and tag the style with the INTEL suffix.
 ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : ImproperCvff(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // The destructor body is intentionally empty.
 ImproperCvffIntel::~ImproperCvffIntel() {}
 
 /* ---------------------------------------------------------------------- */
 
 // Entry point: pick the templated compute() instantiation that matches the
 // precision mode reported by the intel fix.
 void ImproperCvffIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // With _use_base set, defer entirely to the base-class implementation.
   if (_use_base) {
     ImproperCvff::compute(eflag, vflag);
     return;
   }
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Map the run-time eflag/vflag/newton_bond settings onto the compile-time
 // <EFLAG,VFLAG,NEWTON_BOND> instantiation of eval().
 template <class flt_t, class acc_t>
 void ImproperCvffIntel::compute(int eflag, int vflag,
                                     IntelBuffers<flt_t,acc_t> *buffers,
                                     const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   // No tallying requested: only forces are needed.
   if (!evflag) {
     if (force->newton_bond)
       eval<0,0,1>(vflag, buffers, fc);
     else
       eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // Virial without energy gets its own instantiation; every other tallying
   // combination uses the energy+virial one.
   const bool virial_only = (vflag && !eflag);
   if (virial_only) {
     if (force->newton_bond)
       eval<0,1,1>(vflag, buffers, fc);
     else
       eval<0,1,0>(vflag, buffers, fc);
     return;
   }
 
   if (force->newton_bond)
     eval<1,1,1>(vflag, buffers, fc);
   else
     eval<1,1,0>(vflag, buffers, fc);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Evaluate every entry of neighbor->improperlist and accumulate forces
 // (and, when requested, energy and virial).
 //
 //   EFLAG        compile-time switch: compute the per-improper energy
 //   VFLAG        compile-time switch: allow virial tallying
 //   NEWTON_BOND  compile-time switch: when zero, forces are applied only to
 //                atoms whose index is below nlocal
 //   vflag        run-time virial flag, combined with VFLAG
 //   buffers      source of the position array and the force buffers
 //   fc           per-type constants (k, sign, multiplicity)
 //
 // Each thread writes into its own slice f = f_start + tid * f_stride. The
 // scalar energy/virial sums are added to energy and virial[] after the
 // parallel region, and fix->set_reduce_flag() is called at the end.
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void ImproperCvffIntel::eval(const int vflag,
                                  IntelBuffers<flt_t,acc_t> *buffers,
                                  const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nimproperlist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   // Per-thread force slice length: sized for ghosts too when NEWTON_BOND.
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oeimproper = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc) \
     reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF_FIX
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     // Reinterpret the improper list as packed {a,b,c,d,t} records.
     const int5_t * _noalias const improperlist =
       (int5_t *) neighbor->improperlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF_FIX
     acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) seimproper = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = improperlist[n].a;
       const int i2 = improperlist[n].b;
       const int i3 = improperlist[n].c;
       const int i4 = improperlist[n].d;
       const int type = improperlist[n].t;
 
       // geometry of 4-body
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
 
       const flt_t vb2xm = x[i2].x - x[i3].x;
       const flt_t vb2ym = x[i2].y - x[i3].y;
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
 
       // 1st and 2nd angle
 
       const flt_t b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
       const flt_t rb1 = (flt_t)1.0 / sqrt(b1mag2);
       const flt_t sb1 = (flt_t)1.0 / b1mag2;
 
       const flt_t b2mag2 = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
       const flt_t rb2 = (flt_t)1.0 / sqrt(b2mag2);
       const flt_t sb2 = (flt_t)1.0 / b2mag2;
 
       const flt_t b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
       const flt_t rb3 = (flt_t)1.0 / sqrt(b3mag2);
       const flt_t sb3 = (flt_t)1.0 / b3mag2;
 
       const flt_t c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * rb1 * rb3;
 
       flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm;
       const flt_t r12c1 = rb1 * rb2;
       const flt_t c1mag = ctmp * r12c1;
 
       ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
       const flt_t r12c2 = rb2 * rb3;
       const flt_t c2mag = ctmp * r12c2;
 
       // cos and sin of 2 angles and final c
 
       const flt_t sd1 = (flt_t)1.0 - c1mag * c1mag;
       flt_t sc1 = (flt_t)1.0/sqrt(sd1);
       if (sd1 < SMALL2) sc1 = INVSMALL;
 
       const flt_t sd2 = (flt_t)1.0 - c2mag * c2mag;
       flt_t sc2 = (flt_t)1.0/sqrt(sd2);
       // Test the squared sine sd2, exactly as for sd1 above. Testing the
       // reciprocal sc2 never triggers when sd2 is tiny or negative, which
       // lets an Inf/NaN value flow into the forces.
       if (sd2 < SMALL2) sc2 = INVSMALL;
 
       const flt_t s1 = sc1 * sc1;
       const flt_t s2 = sc2 * sc2;
       flt_t s12 = sc1 * sc2;
       flt_t c = (c0 + c1mag*c2mag) * s12;
 
       // error check
       #ifndef LMP_INTEL_USE_SIMDOFF_FIX
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
         MPI_Comm_rank(world,&me);
         if (screen) {
           char str[128];
           sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
           error->warning(FLERR,str,0);
           fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
           fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
           fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
           fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
       #endif
 
       // Clamp the cosine into [-1,1] before evaluating the polynomials.
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       // force & energy
       // p = 1 + cos(n*phi) for d = 1
       // p = 1 - cos(n*phi) for d = -1
       // pd = dp/dc / 2
 
       const int m = fc.fc[type].multiplicity;
 
       // p and pd are assigned only for multiplicities 0 through 6.
       flt_t p, pd;
       #ifdef LMP_INTEL_USE_SIMDOFF_FIX
       #pragma simdoff
       #endif
       {
         if (m == 2) {
           p = (flt_t)2.0*c*c;
           pd = (flt_t)2.0*c;
         } else if (m == 3) {
           const flt_t rc2 = c*c;
           p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
           pd = (flt_t)6.0*rc2 - (flt_t)1.5;
         } else if (m == 4) {
           const flt_t rc2 = c*c;
           p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
           pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
         } else if (m == 6) {
           const flt_t rc2 = c*c;
           p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
           pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
         } else if (m == 1) {
           p = c + (flt_t)1.0;
           pd = (flt_t)0.5;
         } else if (m == 5) {
           const flt_t rc2 = c*c;
           p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
           pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
         } else if (m == 0) {
           p = (flt_t)2.0;
           pd = (flt_t)0.0;
         }
       }
 
       if (fc.fc[type].sign == -1) {
         p = (flt_t)2.0 - p;
         pd = -pd;
       }
 
       flt_t eimproper;
       if (EFLAG) eimproper = fc.fc[type].k * p;
 
       // From here on c and s12 are reused as force prefactors scaled by a.
       const flt_t a = (flt_t)2.0 * fc.fc[type].k * pd;
       c = c * a;
       s12 = s12 * a;
       const flt_t a11 = c*sb1*s1;
       const flt_t a22 = -sb2*((flt_t)2.0*c0*s12 - c*(s1+s2));
       const flt_t a33 = c*sb3*s2;
       const flt_t a12 = -r12c1*(c1mag*c*s1 + c2mag*s12);
       const flt_t a13 = -rb1*rb3*s12;
       const flt_t a23 = r12c2*(c2mag*c*s2 + c1mag*s12);
 
       const flt_t sx2  = a12*vb1x - a22*vb2xm + a23*vb3x;
       const flt_t sy2  = a12*vb1y - a22*vb2ym + a23*vb3y;
       const flt_t sz2  = a12*vb1z - a22*vb2zm + a23*vb3z;
 
       const flt_t f1x = a11*vb1x - a12*vb2xm + a13*vb3x;
       const flt_t f1y = a11*vb1y - a12*vb2ym + a13*vb3y;
       const flt_t f1z = a11*vb1z - a12*vb2zm + a13*vb3z;
 
       const flt_t f2x = -sx2 - f1x;
       const flt_t f2y = -sy2 - f1y;
       const flt_t f2z = -sz2 - f1z;
 
       const flt_t f4x = a13*vb1x - a23*vb2xm + a33*vb3x;
       const flt_t f4y = a13*vb1y - a23*vb2ym + a33*vb3y;
       const flt_t f4z = a13*vb1z - a23*vb2zm + a33*vb3z;
 
       const flt_t f3x = sx2 - f4x;
       const flt_t f3y = sy2 - f4y;
       const flt_t f3z = sz2 - f4z;
 
       // apply force to each of 4 atoms
 
       #ifdef LMP_INTEL_USE_SIMDOFF_FIX
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
           f[i2].y += f2y;
           f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
           f[i4].y += f4y;
           f[i4].z += f4z;
         }
       }
 
       // Tally into the loop-local s* sums under SIMDOFF_FIX, otherwise
       // directly into the o* sums.
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF_FIX
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
                               i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
                               f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
                               vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
                               nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
         #else
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
                               i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
                               f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
                               vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
                               nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
         #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF_FIX
     if (EFLAG) oeimproper += seimproper;
     if (VFLAG && vflag) {
       ov0 += sv0; ov1 += sv1; ov2 += sv2;
       ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
   if (EFLAG) energy += oeimproper;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ImproperCvffIntel::init_style()
 {
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy the per-type k, sign and multiplicity values into the
 // precision-specific table fc.fc, which set_ntypes() sizes to
 // atom->nimpropertypes + 1 entries.
 // The buffers argument is not referenced in this body.
 template <class flt_t, class acc_t>
 void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
                                              IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->nimpropertypes + 1;
   fc.set_ntypes(bp1,memory);
 
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) {
     fc.fc[i].k = k[i];
     fc.fc[i].sign = sign[i];
     fc.fc[i].multiplicity = multiplicity[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the coefficient table fc to hold nimproper entries.
 //
 //   nimproper  requested number of entries; 0 releases the table
 //   memory     allocator to use; when null the previously stored
 //              allocator is kept
 //
 // The allocator is recorded BEFORE it is used. Previously _memory was only
 // assigned at the end of the function, so the first allocation went through
 // whatever value _memory held before this call.
 // NOTE(review): the allocation label names the harmonic style - looks
 // copy-pasted; left unchanged here.
 template <class flt_t>
 void ImproperCvffIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
                                                           Memory *memory) {
   if (memory) _memory = memory;
 
   if (nimproper != _nimpropertypes) {
     if (_nimpropertypes > 0)
       _memory->destroy(fc);
 
     if (nimproper > 0)
       _memory->create(fc,nimproper,"improperharmonicintel.fc");
   }
   _nimpropertypes = nimproper;
 }
diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp
index fe0efca5e..354706167 100644
--- a/src/USER-INTEL/improper_harmonic_intel.cpp
+++ b/src/USER-INTEL/improper_harmonic_intel.cpp
@@ -1,412 +1,412 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <mpi.h>
 #include <math.h>
 #include <stdlib.h>
 #include "improper_harmonic_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "neighbor.h"
 #include "domain.h"
 #include "force.h"
 #include "update.h"
 #include "math_const.h"
 #include "memory.h"
 #include "modify.h"
 #include "suffix.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // flt_t in these macros resolves at the point of use (a template parameter
 // of eval()).
 // Bounds on the computed cosine c outside which eval() prints an
 // "Improper problem" warning.
 #define PTOLERANCE (flt_t)1.05
 #define MTOLERANCE (flt_t)-1.05
 // Lower bound applied in eval() to the squared sines s1 and s2.
 #define SMALL     (flt_t)0.001
 // Threshold on 1 - c*c below which eval() uses INVSMALL instead of
 // 1/sqrt(1 - c*c); INVSMALL equals 1/sqrt(SMALL2).
 #define SMALL2     (flt_t)0.000001
 #define INVSMALL   (flt_t)1000.0
 // Record layout used to read neighbor->improperlist: four atom indices
 // (a,b,c,d) followed by the improper type (t).
 typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
 // Construct on top of ImproperHarmonic and tag the style with the INTEL
 // suffix.
 ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp)
   : ImproperHarmonic(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // The destructor body is intentionally empty.
 ImproperHarmonicIntel::~ImproperHarmonicIntel() {}
 
 /* ---------------------------------------------------------------------- */
 
 // Entry point: pick the templated compute() instantiation that matches the
 // precision mode reported by the intel fix.
 void ImproperHarmonicIntel::compute(int eflag, int vflag)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   // With _use_base set, defer entirely to the base-class implementation.
   if (_use_base) {
     ImproperHarmonic::compute(eflag, vflag);
     return;
   }
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     return;
   }
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     return;
   }
   compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                        force_const_single);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Map the run-time eflag/vflag/newton_bond settings onto the compile-time
 // <EFLAG,VFLAG,NEWTON_BOND> instantiation of eval().
 template <class flt_t, class acc_t>
 void ImproperHarmonicIntel::compute(int eflag, int vflag,
                                     IntelBuffers<flt_t,acc_t> *buffers,
                                     const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
 
   // No tallying requested: only forces are needed.
   if (!evflag) {
     if (force->newton_bond)
       eval<0,0,1>(vflag, buffers, fc);
     else
       eval<0,0,0>(vflag, buffers, fc);
     return;
   }
 
   // Virial without energy gets its own instantiation; every other tallying
   // combination uses the energy+virial one.
   const bool virial_only = (vflag && !eflag);
   if (virial_only) {
     if (force->newton_bond)
       eval<0,1,1>(vflag, buffers, fc);
     else
       eval<0,1,0>(vflag, buffers, fc);
     return;
   }
 
   if (force->newton_bond)
     eval<1,1,1>(vflag, buffers, fc);
   else
     eval<1,1,0>(vflag, buffers, fc);
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void ImproperHarmonicIntel::eval(const int vflag,
                                  IntelBuffers<flt_t,acc_t> *buffers,
                                  const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nimproperlist;
   if (inum == 0) return;
 
   ATOM_T * _noalias const x = buffers->get_x(0);
   const int nlocal = atom->nlocal;
   const int nall = nlocal + atom->nghost;
 
   int f_stride;
   if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
   else f_stride = buffers->get_stride(nlocal);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
 
   acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
   if (EFLAG) oeimproper = (acc_t)0.0;
   if (VFLAG && vflag) {
     ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc) \
     reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
     int nfrom, npl, nto, tid;
     #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
     #else
     IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
     #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
     const int5_t * _noalias const improperlist =
       (int5_t *) neighbor->improperlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
     acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
     if (EFLAG) seimproper = (acc_t)0.0;
     if (VFLAG && vflag) {
       sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
     #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n++) {
     #else
     for (int n = nfrom; n < nto; n += npl) {
     #endif
       const int i1 = improperlist[n].a;
       const int i2 = improperlist[n].b;
       const int i3 = improperlist[n].c;
       const int i4 = improperlist[n].d;
       const int type = improperlist[n].t;
 
       // geometry of 4-body
 
       const flt_t vb1x = x[i1].x - x[i2].x;
       const flt_t vb1y = x[i1].y - x[i2].y;
       const flt_t vb1z = x[i1].z - x[i2].z;
 
       const flt_t vb2x = x[i3].x - x[i2].x;
       const flt_t vb2y = x[i3].y - x[i2].y;
       const flt_t vb2z = x[i3].z - x[i2].z;
 
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
 
       flt_t ss1 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
       flt_t ss2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
       flt_t ss3 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
 
       const flt_t r1 = (flt_t)1.0 / sqrt(ss1);
       const flt_t r2 = (flt_t)1.0 / sqrt(ss2);
       const flt_t r3 = (flt_t)1.0 / sqrt(ss3);
 
       ss1 = (flt_t)1.0 / ss1;
       ss2 = (flt_t)1.0 / ss2;
       ss3 = (flt_t)1.0 / ss3;
 
       // sin and cos of angle
 
       const flt_t c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * r1 * r3;
       const flt_t c1 = (vb1x * vb2x + vb1y * vb2y + vb1z * vb2z) * r1 * r2;
       const flt_t c2 = -(vb3x * vb2x + vb3y * vb2y + vb3z * vb2z) * r3 * r2;
 
       flt_t s1 = 1.0 - c1*c1;
       if (s1 < SMALL) s1 = SMALL;
 
       flt_t s2 = (flt_t)1.0 - c2*c2;
       if (s2 < SMALL) s2 = SMALL;
 
       flt_t s12 = (flt_t)1.0 / sqrt(s1*s2);
       s1 = (flt_t)1.0 / s1;
       s2 = (flt_t)1.0 / s2;
       flt_t c = (c1*c2 + c0) * s12;
 
       // error check
       #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
         MPI_Comm_rank(world,&me);
         if (screen) {
           char str[128];
           sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
           error->warning(FLERR,str,0);
           fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
           fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
           fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
           fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
       #endif
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
 
       const flt_t sd = (flt_t)1.0 - c * c;
       flt_t s = (flt_t)1.0 / sqrt(sd);
       if (sd < SMALL2) s = INVSMALL;
 
       // force & energy
 
       const flt_t domega = acos(c) - fc.fc[type].chi;
       flt_t a;
       a = fc.fc[type].k * domega;
 
       flt_t eimproper;
       if (EFLAG) eimproper = a*domega;
 
       a = -a * (flt_t)2.0 * s;
       c = c * a;
       s12 = s12 * a;
       const flt_t a11 = c*ss1*s1;
       const flt_t a22 = -ss2 * ((flt_t)2.0*c0*s12 - c*(s1+s2));
       const flt_t a33 = c*ss3*s2;
       const flt_t a12 = -r1*r2*(c1*c*s1 + c2*s12);
       const flt_t a13 = -r1*r3*s12;
       const flt_t a23 = r2*r3*(c2*c*s2 + c1*s12);
 
       const flt_t sx2  = a22*vb2x + a23*vb3x + a12*vb1x;
       const flt_t sy2  = a22*vb2y + a23*vb3y + a12*vb1y;
       const flt_t sz2  = a22*vb2z + a23*vb3z + a12*vb1z;
 
       const flt_t f1x = a12*vb2x + a13*vb3x + a11*vb1x;
       const flt_t f1y = a12*vb2y + a13*vb3y + a11*vb1y;
       const flt_t f1z = a12*vb2z + a13*vb3z + a11*vb1z;
 
       const flt_t f2x = -sx2 - f1x;
       const flt_t f2y = -sy2 - f1y;
       const flt_t f2z = -sz2 - f1z;
 
       const flt_t f4x = a23*vb2x + a33*vb3x + a13*vb1x;
       const flt_t f4y = a23*vb2y + a33*vb3y + a13*vb1y;
       const flt_t f4z = a23*vb2z + a33*vb3z + a13*vb1z;
 
       const flt_t f3x = sx2 - f4x;
       const flt_t f3y = sy2 - f4y;
       const flt_t f3z = sz2 - f4z;
 
       // apply force to each of 4 atoms
 
       #ifdef LMP_INTEL_USE_SIMDOFF
       #pragma simdoff
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
           f[i1].y += f1y;
           f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
           f[i2].y += f2y;
           f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
           f[i3].y += f3y;
           f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
           f[i4].y += f4y;
           f[i4].z += f4z;
         }
       }
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
                               i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
                               f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
                               vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
                               nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
         #else
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
                               i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
                               f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
                               vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
                               nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
         #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oeimproper += seimproper;
     if (VFLAG && vflag) {
       ov0 += sv0; ov1 += sv1; ov2 += sv2;
       ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
   if (EFLAG) energy += oeimproper;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
     virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
 }
 
 /* ---------------------------------------------------------------------- */
 
 void ImproperHarmonicIntel::init_style()
 {
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   #ifdef _LMP_INTEL_OFFLOAD
   _use_base = 0;
   if (fix->offload_balance() != 0.0) {
     _use_base = 1;
     return;
   }
   #endif
 
   fix->bond_init_check();
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* Copy the per-type coefficients k[] and chi[] into the precision-specific
    table fc.  The table is sized for atom->nimpropertypes + 1 entries.
    The buffers argument is not referenced in this body. */
 template <class flt_t, class acc_t>
 void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
                                              IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->nimpropertypes + 1;
   fc.set_ntypes(bp1,memory);
 
-  for (int i = 0; i < bp1; i++) {
+  for (int i = 1; i < bp1; i++) { // slot 0 is allocated but not filled here
     fc.fc[i].k = k[i];
     fc.fc[i].chi = chi[i];
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* Resize the per-type coefficient table fc.
      nimproper  number of entries wanted; 0 releases the table
      memory     allocator used for a new table and remembered for the
                 next call
    Nothing is reallocated when the requested size equals the current one. */
 template <class flt_t>
 void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
                                                           Memory *memory) {
   if (nimproper != _nimpropertypes) {
     // An existing table was created through the allocator saved by the
     // previous call, so release it through that same pointer.
     if (_nimpropertypes > 0)
       _memory->destroy(fc);
 
     if (nimproper > 0) {
       // _memory is only refreshed at the end of this function, so it is
       // stale (or not yet set on a first call) at this point.  Allocate
       // through the caller's pointer; fall back to the saved one only
       // when the caller passed none.
       Memory *allocator = memory ? memory : _memory;
       allocator->create(fc,nimproper,"improperharmonicintel.fc");
     }
   }
   _nimpropertypes = nimproper;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index d49d0d8b0..068f61023 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -1,885 +1,891 @@
 /* -*- c++ -*- -------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 // Compiler detection.  An Intel compiler enables the SIMD pragma helper
 // below; builds dated after 20160720 also enable LMP_INTEL_USE_SIMDOFF.
 // This section sits ahead of the include guard, so it is evaluated on
 // every inclusion of the header.
 #ifdef __INTEL_COMPILER
 #define LMP_SIMD_COMPILER
 #if (__INTEL_COMPILER_BUILD_DATE > 20160720)
 #define LMP_INTEL_USE_SIMDOFF
 #endif
 #endif
 
 // _LMP_INTEL_OFFLOAD requires both the compiler's __INTEL_OFFLOAD and the
 // build's LMP_INTEL_OFFLOAD.  __MIC__ is defined here when
 // __TARGET_ARCH_MIC is set and __MIC__ is not already defined.
 #ifdef __INTEL_OFFLOAD
 #ifdef LMP_INTEL_OFFLOAD
 #define _LMP_INTEL_OFFLOAD
 #ifdef __TARGET_ARCH_MIC
 #ifndef __MIC__
 #define __MIC__ 1
 #endif
 #endif
 #endif
 #endif
 
 #ifndef LMP_INTEL_PREPROCESS_H
 #define LMP_INTEL_PREPROCESS_H
 
 // LAMMPS_MEMALIGN is set to 64 by default for -DLMP_USER_INTEL
 // so we only need to error out in case of a different alignment
 #if LAMMPS_MEMALIGN && (LAMMPS_MEMALIGN != 64)
 #error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS of your LAMMPS makefile for USER-INTEL package
 #endif
 
 // Expands to _Pragma(txt) in OpenMP builds and to nothing otherwise.
 #if defined(_OPENMP)
 #define _use_omp_pragma(txt) _Pragma(txt)
 #else
 #define _use_omp_pragma(txt)
 #endif
 
 // Expands to _Pragma(txt) only when LMP_SIMD_COMPILER was defined above.
 #if defined(LMP_SIMD_COMPILER)
 #define _use_simd_pragma(txt) _Pragma(txt)
 #else
 #define _use_simd_pragma(txt)
 #endif
 
 namespace LAMMPS_NS {
 
 // Index constants.  NOTE(review): the LMP_* names look like slots of an
 // overflow/bounds array filled elsewhere -- confirm against the users.
 enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
       LMP_GHOST_MAX};
 // Timer slots; NUM_ITIMERS below is derived from the last entry.
 enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
       TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
       TIME_IMBALANCE};
 
 #define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
 #define INTEL_MIC_VECTOR_WIDTH 16
 // Default host vector width; overridden below per instruction set.
 #define INTEL_VECTOR_WIDTH 4
 #define INTEL_MAX_STENCIL 256
 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
 #define INTEL_MAX_STENCIL_CHECK 4096
 #define INTEL_P3M_MAXORDER 8
 #define INTEL_P3M_ALIGNED_MAXORDER 8
 // PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
 #define INTEL_P3M_TABLE 1
 
 // With the Intel compiler the host width becomes 8 for AVX/AVX2 and 16 for
 // AVX512F; any other compiler falls through to the #else branch (width 1).
 #ifdef __INTEL_COMPILER
 #ifdef __AVX__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 8
 #endif
 
 #ifdef __AVX2__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 8
 #endif
 
 #ifdef __AVX512F__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 16
 #define INTEL_V512 1
 #define INTEL_VMASK 1
 #else
 #ifdef __MIC__
 #define INTEL_V512 1
 #define INTEL_VMASK 1
 #define INTEL_HTHREADS 4
 #endif
 #endif
 
 #ifdef __AVX512ER__
 #define INTEL_HTHREADS 4
 #endif
 
 // LMP_USE_AVXCD is only enabled for AVX512CD builds without offload.
 #ifdef __AVX512CD__
 #ifndef _LMP_INTEL_OFFLOAD
 #define LMP_USE_AVXCD
 #endif
 #endif
 
 // Width used at compile time: the MIC width when building for __MIC__,
 // the host width otherwise.
 #ifdef __MIC__
 #define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH
 #else
 #define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH
 #endif
 
 #else
 
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 1
 #define INTEL_COMPILE_WIDTH 1
 
 #endif
 
 #define INTEL_DATA_ALIGN 64
 #define INTEL_ONEATOM_FACTOR 1
 #define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
 #define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
 #define INTEL_LB_MEAN_WEIGHT 0.1
 #define INTEL_BIGP 1e15
 #define INTEL_MAX_HOST_CORE_COUNT 512
 #define INTEL_MAX_COI_CORES 36
 
 // Default of 2 unless one of the branches above already chose 4.
 #ifndef INTEL_HTHREADS
 #define INTEL_HTHREADS 2
 #endif
 
 // Compute a padded element count for n items of datasize bytes each
 // (2*n items when torque is non-zero) and store it in stride.  The padding
 // is INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN) bytes, so a length that
 // is already a multiple of INTEL_DATA_ALIGN still gains one full
 // INTEL_DATA_ALIGN worth of padding.
 #define IP_PRE_get_stride(stride, n, datasize, torque)  \
   {                                                             \
     int blength = n;                                            \
     if (torque) blength *= 2;                                   \
     const int bytes = blength * datasize;                       \
     stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN);     \
     stride = blength + stride / datasize;                       \
   }
 
 #if defined(_OPENMP)
 
 // Split [0, inum) into nthreads contiguous ranges and store the range of
 // thread tid in [ifrom, ito).  Each thread gets inum/nthreads items and the
 // first inum % nthreads threads get one extra.  ifrom and ito must be
 // assignable; tid, inum and nthreads may be arbitrary integer expressions
 // (they are parenthesized on every use so operator precedence inside the
 // argument cannot change the arithmetic).
 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
   {                                                             \
     int idelta = (inum)/(nthreads);                             \
     const int imod = (inum) % (nthreads);                       \
     ifrom = (tid) * idelta;                                     \
     ito = ifrom + idelta;                                       \
     if ((tid) < imod) {                                         \
       ito+=(tid)+1;                                             \
       ifrom+=(tid);                                             \
     } else {                                                    \
       ito+=imod;                                                \
       ifrom+=imod;                                              \
     }                                                           \
   }
 
 // Same partition as IP_PRE_omp_range, but first overwrites tid with
 // omp_get_thread_num().
 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
   {                                                             \
     tid = omp_get_thread_num();                                 \
     IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads);          \
   }
 
 // Choose a start (ifrom), end (ito) and step (ip) for thread tid of nthr:
 //  - nthr <= INTEL_HTHREADS: every thread covers up to inum, starting at
 //    tid and stepping by nthr;
 //  - nthr a multiple of INTEL_HTHREADS: threads are taken in groups of
 //    INTEL_HTHREADS; group td gets a contiguous IP_PRE_omp_range slice and
 //    member tm starts at offset tm within it, stepping by INTEL_HTHREADS;
 //  - otherwise: a plain contiguous IP_PRE_omp_range slice with step 1.
 #define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr)      \
   {                                                             \
     if (nthr <= INTEL_HTHREADS) {                               \
       ifrom = tid;                                              \
       ito = inum;                                               \
       ip = nthr;                                                \
     } else if (nthr % INTEL_HTHREADS == 0) {                    \
       int nd = nthr / INTEL_HTHREADS;                           \
       int td = tid / INTEL_HTHREADS;                            \
       int tm = tid % INTEL_HTHREADS;                            \
       IP_PRE_omp_range(ifrom, ito, td, inum, nd);               \
       ifrom += tm;                                              \
       ip = INTEL_HTHREADS;                                      \
     } else {                                                    \
       IP_PRE_omp_range(ifrom, ito, tid, inum, nthr);            \
       ip = 1;                                                   \
     }                                                           \
   }
 
 // Same as IP_PRE_omp_stride, but first overwrites tid with
 // omp_get_thread_num().
 #define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)   \
   {                                                             \
     tid = omp_get_thread_num();                                 \
     IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr);         \
   }
 
 // Split [0, inum) into per-thread ranges whose length is a multiple of
 // INTEL_DATA_ALIGN / datasize elements, storing the range of thread tid in
 // [ifrom, ito); ito is clipped to inum.  The per-thread length is
 // ceil(inum / chunk_size / nthreads) chunks, computed with integer
 // arithmetic: a float quotient is inexact once inum exceeds 2^24 and could
 // leave the last elements outside every thread's range.
 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                              datasize)                          \
 {                                                               \
   const int chunk_size = INTEL_DATA_ALIGN / (datasize);         \
   const int ip_nchunks = ((inum) + chunk_size - 1) / chunk_size;\
   int idelta = (ip_nchunks + (nthreads) - 1) / (nthreads);      \
   idelta *= chunk_size;                                         \
   ifrom = (tid)*idelta;                                         \
   ito = ifrom + idelta;                                         \
   if (ito > (inum)) ito = (inum);                               \
 }
 
 // Same as IP_PRE_omp_range_align, but first overwrites tid with
 // omp_get_thread_num().
 #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
                                 nthreads, datasize)             \
   {                                                             \
     tid = omp_get_thread_num();                                 \
     IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads,     \
                            datasize);                           \
   }
 
-#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
-                                nthreads, vecsize)              \
+#define IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthreads,	\
+                             vecsize)				\
   { /* tid is an input here; range lengths are multiples of vecsize */ \
-    tid = omp_get_thread_num();                                 \
     int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
                                        /vecsize/nthreads));     \
     idelta *= vecsize;                                          \
     ifrom = tid*idelta;                                         \
     ito = ifrom + idelta;                                       \
     if (ito > inum) ito = inum; /* clip the last range */       \
   }
 
+#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
+                                nthreads, vecsize)              \
+  { /* sets tid from OpenMP, then uses IP_PRE_omp_range_vec */  \
+    tid = omp_get_thread_num();                                 \
+    IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthreads,	\
+			 vecsize);				\
+  }
+
 // Vector-width variant of the strided split: sets tid from
 // omp_get_thread_num(), then picks start (ifrom), end (ito) and step (ip)
 // in units of vecsize.  The three cases mirror IP_PRE_omp_stride; the
 // grouped case passes the group index td (not the thread id) to the range
 // macro, which therefore must not reassign its tid argument.
 #define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
                                  nthr, vecsize)                 \
   {                                                             \
     tid = omp_get_thread_num();                                 \
     if (nthr <= INTEL_HTHREADS) {                               \
       ifrom = tid*vecsize;                                      \
       ito = inum;                                               \
       ip = nthr*vecsize;                                        \
     } else if (nthr % INTEL_HTHREADS == 0) {                    \
       int nd = nthr / INTEL_HTHREADS;                           \
       int td = tid / INTEL_HTHREADS;                            \
       int tm = tid % INTEL_HTHREADS;                            \
-      IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd,         \
-        vecsize);                                               \
+      IP_PRE_omp_range_vec(ifrom, ito, td, inum, nd, vecsize);	\
       ifrom += tm * vecsize;                                    \
       ip = INTEL_HTHREADS * vecsize;                            \
     } else {                                                    \
-      IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr,      \
-                              vecsize);                         \
+      IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthr,		\
+			   vecsize);				\
       ip = vecsize;                                             \
     }                                                           \
   }
 
 #else
 
 // Serial fallback (no OpenMP): thread 0 owns the whole range [0, inum).
 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
   {                                                             \
     tid = 0;                                                    \
     ifrom = 0;                                                  \
     ito = inum;                                                 \
   }
 
 // Serial fallback: the whole range [0, inum); tid and nthreads are ignored.
 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
   {                                                             \
     ifrom = 0;                                                  \
     ito = inum;                                                 \
   }
 
 // Serial fallback: thread 0 walks [0, inum) with step 1.
 #define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)   \
   {                                                             \
     tid = 0;                                                    \
     ifrom = 0;                                                  \
     ito = inum;                                                 \
     ip = 1;                                                     \
   }
 
 // Serial fallback: the whole range [0, inum); no alignment rounding is
 // applied and tid, nthreads and datasize are ignored.
 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                              datasize)                          \
 {                                                               \
     ifrom = 0;                                                  \
     ito = inum;                                                 \
 }
 
 // Serial fallback: thread 0 owns the whole range [0, inum).
 #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
                                 nthreads, datasize)             \
 {                                                               \
   tid = 0;                                                      \
   ifrom = 0;                                                    \
   ito = inum;                                                   \
 }
 
 // Serial fallback: thread 0 owns the whole range [0, inum).
 // NOTE(review): no serial counterpart of IP_PRE_omp_range_vec is defined
 // among the fallbacks visible here -- confirm none is needed outside
 // OpenMP builds.
 #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
                                 nthreads, vecsize)              \
   {                                                             \
     tid = 0;                                                    \
     ifrom = 0;                                                  \
     ito = inum;                                                 \
   }
 
 // Serial fallback: thread 0 walks [0, inum) with step 1 (the step is not
 // scaled by vecsize here, unlike the OpenMP version).
 #define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
                                  nthr, vecsize)                 \
   {                                                             \
     tid = 0;							\
     ifrom = 0;							\
     ip = 1;							\
     ito = inum;							\
   }
 
 #endif
 
 // Reduce per-thread force copies and accumulate virial sums over the
 // scalar index range [lf, lt).
 //   f_start/f_stride  first force copy and the element stride between
 //                     copies (4 scalars per element, hence f_stride * 4)
 //   pos/minlocal      positions; pos[minlocal] pairs with scalar index 0
 //   nthreads          number of copies to fold into the first one; only
 //                     1, 2, 3 and 4 are handled (1 sums nothing)
 //   ov0..ov5          accumulators updated in place
 // The data is walked in groups of 4 scalars.  Products of matching
 // scalars 0, 1, 2 of each group go to ov0, ov1, ov2 (via the ovv scratch
 // vector; scalar 3 is accumulated in ovv but never read), and the cross
 // products f[1]*x[0], f[2]*x[0], f[2]*x[1] go to ov3, ov4, ov5.
 // Full vectors of vwidth scalars are processed first; the remaining
 // lt % vwidth scalars are summed per branch and their products are added
 // by the final 4-wide loop.  Expects the types flt_t and acc_t and the
 // _alignvar macro to be available at the expansion site.
 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
                                   f_stride, pos, ov0, ov1, ov2,         \
                                   ov3, ov4, ov5)                        \
 {                                                                       \
   acc_t *f_scalar = &f_start[0].x;                                      \
   flt_t *x_scalar = &pos[minlocal].x;                                   \
   int f_stride4 = f_stride * 4;                                         \
   _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64);                         \
   int vwidth;                                                           \
   if (sizeof(acc_t) == sizeof(double))                                  \
     vwidth = INTEL_COMPILE_WIDTH/2;                                     \
   else                                                                  \
     vwidth = INTEL_COMPILE_WIDTH;                                       \
   if (vwidth < 4) vwidth = 4;                                           \
   _use_simd_pragma("vector aligned")                                    \
   _use_simd_pragma("simd")                                              \
   for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0;                 \
   int remainder = lt % vwidth;                                          \
   if (lf > lt) remainder = 0;                                           \
   const int v_range = lt - remainder;                                   \
   if (nthreads == 2) {                                                  \
     acc_t *f_scalar2 = f_scalar + f_stride4;                            \
     for (int n = lf; n < v_range; n += vwidth) {                        \
       _use_simd_pragma("vector aligned")                                \
       _use_simd_pragma("simd")                                          \
       for (int v = 0; v < vwidth; v++) {                                \
         f_scalar[n+v] += f_scalar2[n+v];                                \
         ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
       }                                                                 \
       ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
       ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
       ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
       if (vwidth > 4) {                                                 \
         ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
         ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
         ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
       }                                                                 \
       if (vwidth > 8) {                                                 \
         ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
         ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
         ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
         ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
         ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
         ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
       }                                                                 \
     }                                                                   \
     _use_simd_pragma("vector aligned")                                  \
     _use_simd_pragma("ivdep")                                           \
     _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
     for (int n = v_range; n < lt; n++)                                  \
       f_scalar[n] += f_scalar2[n];                                      \
   } else if (nthreads==4) {                                             \
     acc_t *f_scalar2 = f_scalar + f_stride4;                            \
     acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
     acc_t *f_scalar4 = f_scalar3 + f_stride4;                           \
     for (int n = lf; n < v_range; n += vwidth) {                        \
       _use_simd_pragma("vector aligned")                                \
       _use_simd_pragma("simd")                                          \
       for (int v = 0; v < vwidth; v++) {                                \
         f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] +              \
           f_scalar4[n+v];                                               \
         ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
       }                                                                 \
       ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
       ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
       ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
       if (vwidth > 4) {                                                 \
         ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
         ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
         ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
       }                                                                 \
       if (vwidth > 8) {                                                 \
         ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
         ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
         ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
         ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
         ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
         ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
       }                                                                 \
     }                                                                   \
     _use_simd_pragma("vector aligned")                                  \
     _use_simd_pragma("ivdep")                                           \
     _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
     for (int n = v_range; n < lt; n++)                                  \
       f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];        \
   } else if (nthreads==1) {                                             \
     for (int n = lf; n < v_range; n += vwidth) {                        \
       _use_simd_pragma("vector aligned")                                \
       _use_simd_pragma("simd")                                          \
       for (int v = 0; v < vwidth; v++)                                  \
         ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
       ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
       ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
       ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
       if (vwidth > 4) {                                                 \
         ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
         ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
         ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
       }                                                                 \
       if (vwidth > 8) {                                                 \
         ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
         ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
         ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
         ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
         ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
         ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
       }                                                                 \
     }                                                                   \
   } else if (nthreads==3) {                                             \
     acc_t *f_scalar2 = f_scalar + f_stride4;                            \
     acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
     for (int n = lf; n < v_range; n += vwidth) {                        \
       _use_simd_pragma("vector aligned")                                \
       _use_simd_pragma("simd")                                          \
       for (int v = 0; v < vwidth; v++) {                                \
         f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v];               \
         ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
       }                                                                 \
       ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
       ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
       ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
       if (vwidth > 4) {                                                 \
         ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
         ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
         ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
       }                                                                 \
       if (vwidth > 8) {                                                 \
         ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
         ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
         ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
         ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
         ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
         ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
       }                                                                 \
     }                                                                   \
     _use_simd_pragma("vector aligned")                                  \
     _use_simd_pragma("ivdep")                                           \
     _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
     for (int n = v_range; n < lt; n++)                                  \
       f_scalar[n] += f_scalar2[n] + f_scalar3[n];                       \
   }                                                                     \
   for (int n = v_range; n < lt; n += 4) {                               \
     _use_simd_pragma("vector aligned")                                  \
     _use_simd_pragma("ivdep")                                           \
     for (int v = 0; v < 4; v++)                                         \
       ovv[v] += f_scalar[n+v] * x_scalar[n+v];                          \
     ov3 += f_scalar[n+1] * x_scalar[n+0];                               \
     ov4 += f_scalar[n+2] * x_scalar[n+0];                               \
     ov5 += f_scalar[n+2] * x_scalar[n+1];                               \
   }                                                                     \
   ov0 += ovv[0];                                                        \
   ov1 += ovv[1];                                                        \
   ov2 += ovv[2];                                                        \
   if (vwidth > 4) {                                                     \
     ov0 += ovv[4];                                                      \
     ov1 += ovv[5];                                                      \
     ov2 += ovv[6];                                                      \
   }                                                                     \
   if (vwidth > 8) {                                                     \
     ov0 += ovv[8] + ovv[12];                                            \
     ov1 += ovv[9] + ovv[13];                                            \
     ov2 += ovv[10] + ovv[14];                                           \
   }                                                                     \
 }
 
 // Fold the per-thread force copies into the first copy for the calling
 // thread's aligned slice of the (nall - minlocal) * 4 scalars and, when
 // vflag == 2, also accumulate the virial sums ov0..ov5.
 // With vflag == 2 the plain summation loop starts at copy 4; copies 1..3
 // are then summed inside IP_PRE_fdotr_acc_force_l5 together with the
 // virial products.  Otherwise every copy from 1 on is summed here.
 // Relies on iifrom, iito and tid being declared at the expansion site;
 // the offload parameter is not referenced in this body.
 #define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,       \
                                f_stride, pos, offload, vflag, ov0, ov1, \
                                ov2, ov3, ov4, ov5)                      \
 {                                                                       \
   int o_range = (nall - minlocal) * 4;                                  \
   IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads,       \
                             sizeof(acc_t));                             \
                                                                         \
   acc_t *f_scalar = &f_start[0].x;                                      \
   int f_stride4 = f_stride * 4;                                         \
   int t;                                                                \
   if (vflag == 2) t = 4; else t = 1;                                    \
   acc_t *f_scalar2 = f_scalar + f_stride4 * t;                          \
   for ( ; t < nthreads; t++) {                                          \
     _use_simd_pragma("vector aligned")                                  \
     _use_simd_pragma("simd")                                            \
     for (int n = iifrom; n < iito; n++)                                 \
       f_scalar[n] += f_scalar2[n];                                      \
     f_scalar2 += f_stride4;                                             \
   }                                                                     \
                                                                         \
   if (vflag == 2) {                                                     \
     int nt_min = MIN(4,nthreads);                                       \
     IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start,  \
                               f_stride, pos, ov0, ov1, ov2, ov3, ov4,   \
                               ov5);                                     \
   }                                                                     \
 }
 
 #ifdef _LMP_INTEL_OFFLOAD
 #include <sys/time.h>
 
 // Wall-clock time in seconds, read with gettimeofday(). Declared with
 // target(mic) for the offload build.
 __declspec( target (mic))
 inline double MIC_Wtime() {
   struct timeval now;
   gettimeofday(&now, NULL);
 
   // whole seconds plus the microsecond remainder
   const double seconds = 1.0 * now.tv_sec + 1.0e-6 * now.tv_usec;
   return seconds;
 }
 
 // Offload build: pack atom data into the separate buffers. Does nothing
 // unless fix->separate_buffers() is set and ago != 0; the work is timed
 // under TIME_PACK. `comm` is taken from the caller's scope.
 //
 // offload set:   an OpenMP region with packthreads threads (comm->nthreads
 //   when that exceeds INTEL_HTHREADS, otherwise 1) splits nlocal items and
 //   packs them with buffers->thr_pack_cop(ifrom, ito, 0). If there are
 //   ghost atoms (nall - nlocal != 0) the same threads split that count and
 //   call thr_pack_cop on the range shifted up by nlocal, passing
 //   fix->offload_min_ghost() - nlocal and (ago == 1) as extra arguments.
 // offload clear: two serial buffers->thr_pack_host() calls, first with
 //   (fix->host_min_local(), nlocal, 0), then with
 //   (nlocal, nall, fix->host_min_ghost() - nlocal).
 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                      nlocal, nall)                      \
 {                                                                       \
     if (fix->separate_buffers() && ago != 0) {                          \
     fix->start_watch(TIME_PACK);                                        \
     if (offload) {                                                      \
       int packthreads;                                                  \
       if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\
       else packthreads = 1;                                             \
       _use_omp_pragma("omp parallel if(packthreads > 1)")               \
       {                                                                 \
         int ifrom, ito, tid;                                            \
         IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,              \
                                   packthreads, sizeof(flt_t));          \
         buffers->thr_pack_cop(ifrom, ito, 0);                           \
         int nghost = nall - nlocal;                                     \
         if (nghost) {                                                   \
           IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,        \
                                  packthreads, sizeof(flt_t));           \
           buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal,           \
                                 fix->offload_min_ghost() - nlocal,      \
                                 ago == 1);                              \
         }                                                               \
       }                                                                 \
     } else {                                                            \
       buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);         \
       buffers->thr_pack_host(nlocal, nall,                              \
                              fix->host_min_ghost()-nlocal);             \
     }                                                                   \
     fix->stop_watch(TIME_PACK);                                         \
   }                                                                     \
 }
 
 // Offload build: fill in the transfer sizes and flags for one eval pass.
 // Reads nall, inum and lmp from the caller's scope in addition to its
 // arguments.
 //
 // Values written:
 //   separate_flag  0, except when ago == 0 and offload is set: then 3 if
 //                  fix->separate_buffers() is false, otherwise 2 when
 //                  lmp->atom->torque is set and 1 when it is not.
 //   x_size, q_size 0 and nall when ago == 0; nall and 0 otherwise.
 //   ev_size        8 if vflag, else 2 if eflag, else 0.
 //   f_stride       buffers->get_stride(nall) with newton, otherwise
 //                  buffers->get_stride(inum).
 #define IP_PRE_get_transfern(ago, newton, eflag, vflag,                 \
                              buffers, offload, fix, separate_flag,      \
                              x_size, q_size, ev_size, f_stride)         \
 {                                                                       \
   separate_flag = 0;                                                    \
   if (ago == 0) {                                                       \
     x_size = 0;                                                         \
     q_size = nall;                                                      \
     if (offload) {                                                      \
       if (fix->separate_buffers()) {                                    \
         if (lmp->atom->torque)                                          \
           separate_flag = 2;                                            \
         else                                                            \
           separate_flag = 1;                                            \
       } else                                                            \
         separate_flag = 3;                                              \
     }                                                                   \
   } else {                                                              \
     x_size = nall;                                                      \
     q_size = 0;                                                         \
   }                                                                     \
   ev_size = 0;                                                          \
   if (eflag) ev_size = 2;                                               \
   if (vflag) ev_size = 8;                                               \
   if (newton)                                                           \
     f_stride = buffers->get_stride(nall);                               \
   else                                                                  \
     f_stride = buffers->get_stride(inum);                               \
 }
 
 // Offload build: pick the thread count (tc), force buffer (f_start) and
 // energy/virial buffer (ev_global) for an eval pass.
 //
 // With offload set they come from buffers->get_off_threads(),
 // get_off_f() and get_ev_global(). Otherwise they come from
 // comm->nthreads, get_f() and get_ev_global_host(), and the
 // TIME_HOST_PAIR watch is started on fix. `comm` is taken from the
 // caller's scope.
 #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,          \
                            ev_global)                                   \
 {                                                                       \
   if (offload) {                                                        \
     tc = buffers->get_off_threads();                                    \
     f_start = buffers->get_off_f();                                     \
     ev_global = buffers->get_ev_global();                               \
   } else {                                                              \
     tc = comm->nthreads;                                                \
     f_start = buffers->get_f();                                         \
     fix->start_watch(TIME_HOST_PAIR);                                   \
     ev_global = buffers->get_ev_global_host();                          \
   }                                                                     \
 }
 
 // Offload build: when separate_flag is nonzero, shrink the atom counts to
 // the ranges recorded in the caller-scope `overflow` array and close the
 // gap between local and ghost entries.
 //
 // For separate_flag 1 or 2:
 //   - nlocal becomes overflow[LMP_LOCAL_MAX] + 1, the ghost count is
 //     overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN] (clamped at 0)
 //     and nall becomes their sum;
 //   - separate_flag is decremented and f_stride is recomputed through
 //     IP_PRE_get_stride for nall (newton) or nlocal entries;
 //   - if there are ghosts and (nlocal < old nlocal || ghost_min > old
 //     nlocal), the ghost entries of the position array, and of q when q
 //     is non-null, are moved down to start at index nlocal.
 // For every nonzero separate_flag the x/y/z members of entry [nall] of the
 // position array are set to INTEL_BIGP.
 //
 // nlocal, nall, separate_flag and f_stride are modified in place, so they
 // must be lvalues. The position-array parameter is deliberately not named
 // `x`: a parameter called x would also be substituted into the `.x`
 // member access below, making the macro work only for an argument that is
 // literally spelled x.
 #define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,  \
                                   f_stride, x_atoms, q)                 \
 {                                                                       \
   if (separate_flag) {                                                  \
     if (separate_flag < 3) {                                            \
       int all_local = nlocal;                                           \
       int ghost_min = overflow[LMP_GHOST_MIN];                          \
       nlocal = overflow[LMP_LOCAL_MAX] + 1;                             \
       int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;             \
       if (nghost < 0) nghost = 0;                                       \
       nall = nlocal + nghost;                                           \
       separate_flag--;                                                  \
       int flength;                                                      \
       if (newton) flength = nall;                                       \
       else flength = nlocal;                                            \
       IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),             \
                            separate_flag);                              \
       if (nghost) {                                                     \
         if (nlocal < all_local || ghost_min > all_local) {              \
           memmove(x_atoms + nlocal, x_atoms + ghost_min,                \
                   (nall - nlocal) * sizeof(ATOM_T));                    \
           if (q != 0)                                                   \
             memmove((void *)(q + nlocal), (void *)(q + ghost_min),      \
                     (nall - nlocal) * sizeof(flt_t));                   \
         }                                                               \
       }                                                                 \
     }                                                                   \
     x_atoms[nall].x = INTEL_BIGP;                                       \
     x_atoms[nall].y = INTEL_BIGP;                                       \
     x_atoms[nall].z = INTEL_BIGP;                                       \
   }                                                                     \
 }
 
 // Offload build: with newton set, issue _use_omp_pragma("omp barrier") and
 // then run IP_PRE_fdotr_acc_force with the same arguments. Expands to an
 // empty test when newton is false.
 #define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,       \
                                 f_start, f_stride, x, offload, vflag,   \
                                 ov0, ov1, ov2, ov3, ov4, ov5)           \
 {                                                                       \
   if (newton) {                                                         \
     _use_omp_pragma("omp barrier");                                     \
     IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,           \
                            f_stride, x, offload, vflag, ov0, ov1, ov2,  \
                            ov3, ov4, ov5);                              \
   }                                                                     \
 }
 
 // Offload build: this macro expands to nothing.
 #define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,    \
                             ov0, ov1, ov2, ov3, ov4, ov5)
 
 #else
 
 // Without offload support MIC_Wtime is simply an alias for MPI_Wtime.
 #define MIC_Wtime MPI_Wtime
 // Without offload support this macro expands to nothing.
 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                      nlocal, nall)
 
 // Host-only build: clear separate_flag and set f_stride to
 // buffers->get_stride() of nall (newton) or nlocal (no newton). nall and
 // nlocal are read from the caller's scope; the remaining arguments are
 // accepted for signature compatibility and left untouched.
 #define IP_PRE_get_transfern(ago, newton, eflag, vflag,                 \
                              buffers, offload, fix, separate_flag,      \
                              x_size, q_size, ev_size, f_stride)         \
 {                                                                       \
   separate_flag = 0;                                                    \
   const int f_length = (newton) ? nall : nlocal;                        \
   f_stride = buffers->get_stride(f_length);                             \
 }
 
 // Host-only build: tc is comm->nthreads, f_start is buffers->get_f() and
 // ev_global is buffers->get_ev_global_host(); the TIME_HOST_PAIR watch is
 // started on fix in between. `comm` is taken from the caller's scope and
 // the offload argument is ignored.
 #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,          \
                            ev_global)                                   \
 {                                                                       \
   tc = comm->nthreads;                                                  \
   f_start = buffers->get_f();                                           \
   fix->start_watch(TIME_HOST_PAIR);                                     \
   ev_global = buffers->get_ev_global_host();                            \
 }
 
 // Without offload support this macro expands to nothing.
 #define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,  \
                                   f_stride, x, q)
 
 // Host-only build: when newton is set, vflag == 2 and more than
 // INTEL_HTHREADS threads are in use, issue an "omp barrier" pragma and call
 // buffers->fdotr_reduce(). `buffers` is taken from the caller's scope.
 // minlocal, f_start, x and offload are accepted but not used.
 #define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,       \
                                 f_start, f_stride, x, offload, vflag,   \
                                 ov0, ov1, ov2, ov3, ov4, ov5)           \
 {                                                                       \
   if ((newton) && (vflag == 2 && nthreads > INTEL_HTHREADS)) {          \
     _use_omp_pragma("omp barrier");                                     \
     buffers->fdotr_reduce(nall, nthreads, f_stride,                     \
                           ov0, ov1, ov2, ov3, ov4, ov5);                \
   }                                                                     \
 }
 
 // Host-only build: when newton is set, vflag == 2 and at most
 // INTEL_HTHREADS threads are in use, call buffers->fdotr_reduce_l5() over
 // the scalar range [0, nall * 4). `buffers` is taken from the caller's
 // scope.
 #define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,    \
                             ov0, ov1, ov2, ov3, ov4, ov5)               \
 {                                                                       \
   if ((newton) && (vflag == 2 && nthreads <= INTEL_HTHREADS)) {         \
     int lt = nall * 4;                                                  \
     buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride,                 \
                              ov0, ov1, ov2, ov3, ov4, ov5);             \
   }                                                                     \
 }
 
 #endif
 
 // Add one pair interaction to the caller-scope virial sums sv0..sv5:
 // the xx, yy, zz, xy, xz and yz products of the separation components,
 // each multiplied by fpair. Nothing is added unless vflag == 1.
 //
 // Every parameter is parenthesized in the expansion so that expression
 // arguments (for example `xi - xj`) keep their intended grouping.
 #define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz)            \
 {                                                                       \
   if ((vflag) == 1) {                                                   \
     sv0 += (delx) * (delx) * (fpair);                                   \
     sv1 += (dely) * (dely) * (fpair);                                   \
     sv2 += (delz) * (delz) * (fpair);                                   \
     sv3 += (delx) * (dely) * (fpair);                                   \
     sv4 += (delx) * (delz) * (fpair);                                   \
     sv5 += (dely) * (delz) * (fpair);                                   \
   }                                                                     \
 }
 
 // Vector-force variant of the pair virial tally: adds the products of the
 // separation components (dx, dy, dz) with the force components
 // (fpx, fpy, fpz) to the caller-scope sums sv0..sv5 in the order
 // xx, yy, zz, xy, xz, yz. Nothing is added unless vflag == 1.
 //
 // Every parameter is parenthesized in the expansion so that expression
 // arguments keep their intended grouping.
 #define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz)         \
 {                                                                       \
   if ((vflag) == 1) {                                                   \
     sv0 += (dx) * (fpx);                                                \
     sv1 += (dy) * (fpy);                                                \
     sv2 += (dz) * (fpz);                                                \
     sv3 += (dx) * (fpy);                                                \
     sv4 += (dx) * (fpz);                                                \
     sv5 += (dy) * (fpz);                                                \
   }                                                                     \
 }
 
 // Three-body virial tally: adds del{x,y,z} * fj[...] plus
 // delr2[...] * fk[...] to the caller-scope sums sv0..sv5 (xx, yy, zz, xy,
 // xz, yz). fj, fk and delr2 are indexed with [0], [1], [2]. Nothing is
 // added unless vflag == 1.
 //
 // Every parameter is parenthesized in the expansion so that expression
 // arguments (including pointer expressions such as `base + off`) keep
 // their intended grouping.
 #define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2)   \
 {                                                                       \
   if ((vflag) == 1) {                                                   \
     sv0 += (delx) * (fj)[0] + (delr2)[0] * (fk)[0];                     \
     sv1 += (dely) * (fj)[1] + (delr2)[1] * (fk)[1];                     \
     sv2 += (delz) * (fj)[2] + (delr2)[2] * (fk)[2];                     \
     sv3 += (delx) * (fj)[1] + (delr2)[0] * (fk)[1];                     \
     sv4 += (delx) * (fj)[2] + (delr2)[0] * (fk)[2];                     \
     sv5 += (dely) * (fj)[2] + (delr2)[1] * (fk)[2];                     \
   }                                                                     \
 }
 
 // Three-body virial tally for a single force vector given as scalars:
 // adds the products of (delx, dely, delz) with (fj0, fj1, fj2) to the
 // caller-scope sums sv0..sv5 (xx, yy, zz, xy, xz, yz). Nothing is added
 // unless vflag == 1.
 //
 // Every parameter is parenthesized in the expansion so that expression
 // arguments keep their intended grouping.
 #define IP_PRE_ev_tally_nbor3v(vflag, fj0, fj1, fj2, delx, dely, delz)  \
 {                                                                       \
   if ((vflag) == 1) {                                                   \
     sv0 += (delx) * (fj0);                                              \
     sv1 += (dely) * (fj1);                                              \
     sv2 += (delz) * (fj2);                                              \
     sv3 += (delx) * (fj1);                                              \
     sv4 += (delx) * (fj2);                                              \
     sv5 += (dely) * (fj2);                                              \
   }                                                                     \
 }
 
 // Tally the energy and virial of one bond between atoms i1 and i2.
 //
 // Weighting: with newton the full contribution is counted; without it
 // each atom whose index is below nlocal contributes one half.
 //
 //   eflag          add the weighted ebond to obond
 //   eatom          (only with eflag) add ebond / 2 to the .w member of
 //                  force[i1] and force[i2], for each atom that is counted
 //   VFLAG && vflag add the weighted del * del * fbond products to
 //                  ov0..ov5 (xx, yy, zz, xy, xz, yz)
 //
 // The per-atom energy is written through the `force` argument; the macro
 // does not reach for an array named `f` in the caller's scope. Value
 // parameters are parenthesized so expression arguments keep their
 // grouping. flt_t must be defined where the macro is used.
 #define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \
                              fbond, delx, dely, delz, obond, force,     \
                              newton, nlocal, ov0, ov1, ov2, ov3, ov4,   \
                              ov5)                                       \
 {                                                                       \
   flt_t ev_pre;                                                         \
   if (newton) ev_pre = (flt_t)1.0;                                      \
   else {                                                                \
     ev_pre = (flt_t)0.0;                                                \
     if ((i1) < (nlocal)) ev_pre += (flt_t)0.5;                          \
     if ((i2) < (nlocal)) ev_pre += (flt_t)0.5;                          \
   }                                                                     \
                                                                         \
   if (eflag) {                                                          \
     obond += ev_pre * (ebond);                                          \
     if (eatom) {                                                        \
       flt_t halfeng = (ebond) * (flt_t)0.5;                             \
       if ((newton) || (i1) < (nlocal)) (force)[i1].w += halfeng;        \
       if ((newton) || (i2) < (nlocal)) (force)[i2].w += halfeng;        \
     }                                                                   \
   }                                                                     \
                                                                         \
   if ((VFLAG) && (vflag)) {                                             \
     ov0 += ev_pre * ((delx) * (delx) * (fbond));                        \
     ov1 += ev_pre * ((dely) * (dely) * (fbond));                        \
     ov2 += ev_pre * ((delz) * (delz) * (fbond));                        \
     ov3 += ev_pre * ((delx) * (dely) * (fbond));                        \
     ov4 += ev_pre * ((delx) * (delz) * (fbond));                        \
     ov5 += ev_pre * ((dely) * (delz) * (fbond));                        \
   }                                                                     \
 }
 
 // Tally the energy and virial of one angle among atoms i1, i2 and i3.
 //
 // Weighting: with newton the full contribution is counted; without it
 // each atom whose index is below nlocal contributes one third.
 //
 //   eflag          add the weighted eangle to oeangle
 //   eatom          (only with eflag) add eangle / 3 to the .w member of
 //                  force[i1], force[i2] and force[i3], for each atom that
 //                  is counted
 //   VFLAG && vflag add the weighted del1 * f1 + del2 * f3 products to
 //                  ov0..ov5 (xx, yy, zz, xy, xz, yz)
 //
 // The per-atom energy is written through the `force` argument; the macro
 // does not reach for an array named `f` in the caller's scope. Value
 // parameters are parenthesized so expression arguments keep their
 // grouping. flt_t must be defined where the macro is used.
 #define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1,   \
                               i2, i3, f1x, f1y, f1z, f3x, f3y, f3z,     \
                               delx1, dely1, delz1, delx2, dely2, delz2, \
                               oeangle, force, newton, nlocal, ov0, ov1, \
                               ov2, ov3, ov4, ov5)                       \
 {                                                                       \
   flt_t ev_pre;                                                         \
   if (newton) ev_pre = (flt_t)1.0;                                      \
   else {                                                                \
     ev_pre = (flt_t)0.0;                                                \
     if ((i1) < (nlocal)) ev_pre += (flt_t)0.3333333333333333;           \
     if ((i2) < (nlocal)) ev_pre += (flt_t)0.3333333333333333;           \
     if ((i3) < (nlocal)) ev_pre += (flt_t)0.3333333333333333;           \
   }                                                                     \
                                                                         \
   if (eflag) {                                                          \
     oeangle += ev_pre * (eangle);                                       \
     if (eatom) {                                                        \
       flt_t thirdeng = (eangle) * (flt_t)0.3333333333333333;            \
       if ((newton) || (i1) < (nlocal)) (force)[i1].w += thirdeng;       \
       if ((newton) || (i2) < (nlocal)) (force)[i2].w += thirdeng;       \
       if ((newton) || (i3) < (nlocal)) (force)[i3].w += thirdeng;       \
     }                                                                   \
   }                                                                     \
                                                                         \
   if ((VFLAG) && (vflag)) {                                             \
     ov0 += ev_pre * ((delx1) * (f1x) + (delx2) * (f3x));                \
     ov1 += ev_pre * ((dely1) * (f1y) + (dely2) * (f3y));                \
     ov2 += ev_pre * ((delz1) * (f1z) + (delz2) * (f3z));                \
     ov3 += ev_pre * ((delx1) * (f1y) + (delx2) * (f3y));                \
     ov4 += ev_pre * ((delx1) * (f1z) + (delx2) * (f3z));                \
     ov5 += ev_pre * ((dely1) * (f1z) + (dely2) * (f3z));                \
   }                                                                     \
 }
 
 // Tally the energy and virial of one four-atom interaction (i1..i4).
 //
 // Weighting: with newton the full contribution is counted; without it
 // each atom whose index is below nlocal contributes one quarter.
 //
 //   eflag          add the weighted deng to oedihedral
 //   eatom          (only with eflag) add deng / 4 to the .w member of
 //                  force[i1]..force[i4], for each atom that is counted
 //   VFLAG && vflag add the weighted
 //                  vb1 * f1 + vb2 * f3 + (vb3 + vb2) * f4 products to
 //                  ov0..ov5 (xx, yy, zz, xy, xz, yz)
 //
 // The per-atom energy is written through the `force` argument; the macro
 // does not reach for an array named `f` in the caller's scope. Value
 // parameters are parenthesized so expression arguments keep their
 // grouping. flt_t must be defined where the macro is used.
 #define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \
                               i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
                               f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y,   \
                               vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
                               newton, nlocal, ov0, ov1, ov2, ov3, ov4,  \
                               ov5)                                      \
 {                                                                       \
   flt_t ev_pre;                                                         \
   if (newton) ev_pre = (flt_t)1.0;                                      \
   else {                                                                \
     ev_pre = (flt_t)0.0;                                                \
     if ((i1) < (nlocal)) ev_pre += (flt_t)0.25;                         \
     if ((i2) < (nlocal)) ev_pre += (flt_t)0.25;                         \
     if ((i3) < (nlocal)) ev_pre += (flt_t)0.25;                         \
     if ((i4) < (nlocal)) ev_pre += (flt_t)0.25;                         \
   }                                                                     \
                                                                         \
   if (eflag) {                                                          \
     oedihedral += ev_pre * (deng);                                      \
     if (eatom) {                                                        \
       flt_t qdeng = (deng) * (flt_t)0.25;                               \
       if ((newton) || (i1) < (nlocal)) (force)[i1].w += qdeng;          \
       if ((newton) || (i2) < (nlocal)) (force)[i2].w += qdeng;          \
       if ((newton) || (i3) < (nlocal)) (force)[i3].w += qdeng;          \
       if ((newton) || (i4) < (nlocal)) (force)[i4].w += qdeng;          \
     }                                                                   \
   }                                                                     \
                                                                         \
   if ((VFLAG) && (vflag)) {                                             \
     ov0 += ev_pre * ((vb1x)*(f1x) + (vb2x)*(f3x) +                      \
                      ((vb3x)+(vb2x))*(f4x));                            \
     ov1 += ev_pre * ((vb1y)*(f1y) + (vb2y)*(f3y) +                      \
                      ((vb3y)+(vb2y))*(f4y));                            \
     ov2 += ev_pre * ((vb1z)*(f1z) + (vb2z)*(f3z) +                      \
                      ((vb3z)+(vb2z))*(f4z));                            \
     ov3 += ev_pre * ((vb1x)*(f1y) + (vb2x)*(f3y) +                      \
                      ((vb3x)+(vb2x))*(f4y));                            \
     ov4 += ev_pre * ((vb1x)*(f1z) + (vb2x)*(f3z) +                      \
                      ((vb3x)+(vb2x))*(f4z));                            \
     ov5 += ev_pre * ((vb1y)*(f1z) + (vb2y)*(f3z) +                      \
                      ((vb3y)+(vb2y))*(f4z));                            \
   }                                                                     \
 }
 
 // Fold the per-atom sums of the current atom into the totals.
 //
 // Uses these names from the caller's scope: i (atom index), sevdwl and
 // oevdwl (energy sum / total), sv0..sv5 and ov0..ov5 (virial sums /
 // totals).
 //
 //   eflag                       f[i].w += fwtmp and oevdwl += sevdwl
 //   newton == 0 && vflag == 1   ov0..ov5 += sv0..sv5
 //
 // Parameters are parenthesized so that expression arguments, including a
 // pointer expression such as `base + off` for f, expand correctly.
 #define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp)            \
 {                                                                       \
   if (eflag) {                                                          \
     (f)[i].w += (fwtmp);                                                \
     oevdwl += sevdwl;                                                   \
   }                                                                     \
   if ((newton) == 0 && (vflag) == 1) {                                  \
     ov0 += sv0;                                                         \
     ov1 += sv1;                                                         \
     ov2 += sv2;                                                         \
     ov3 += sv3;                                                         \
     ov4 += sv4;                                                         \
     ov5 += sv5;                                                         \
   }                                                                     \
 }
 
 // Fold the per-atom sums of the current atom into the totals, including
 // the Coulomb energy.
 //
 // Uses these names from the caller's scope: i (atom index), sevdwl,
 // secoul, oevdwl and oecoul (energy sums / totals), sv0..sv5 and ov0..ov5
 // (virial sums / totals).
 //
 //   eflag                       f[i].w += fwtmp, oevdwl += sevdwl and
 //                               oecoul += secoul
 //   newton == 0 && vflag == 1   ov0..ov5 += sv0..sv5
 //
 // Parameters are parenthesized so that expression arguments, including a
 // pointer expression such as `base + off` for f, expand correctly.
 #define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp)           \
 {                                                                       \
   if (eflag) {                                                          \
     (f)[i].w += (fwtmp);                                                \
     oevdwl += sevdwl;                                                   \
     oecoul += secoul;                                                   \
   }                                                                     \
   if ((newton) == 0 && (vflag) == 1) {                                  \
     ov0 += sv0;                                                         \
     ov1 += sv1;                                                         \
     ov2 += sv2;                                                         \
     ov3 += sv3;                                                         \
     ov4 += sv4;                                                         \
     ov5 += sv5;                                                         \
   }                                                                     \
 }
 
 }
 
 #endif
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
index 07beae1e4..e32fd0666 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
@@ -1,553 +1,553 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Rodrigo Canales (RWTH Aachen University)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pair_buck_coul_cut_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "kspace.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 #include "suffix.h"
 #include "force.h"
 #include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // Shorthands for the coefficient record types nested in ForceConst<flt_t>.
 // They carry `typename` because they are used inside templates where
 // flt_t is a template parameter.
 #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
 #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
 #define C_CUT_T typename ForceConst<flt_t>::c_cut_t
 
 // Construct on top of PairBuckCoulCut and mark the style as an INTEL
 // suffix variant by setting the Suffix::INTEL bit in suffix_flag.
 PairBuckCoulCutIntel::PairBuckCoulCutIntel(LAMMPS *lmp) :
   PairBuckCoulCut(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
 
 // Empty destructor: this class releases nothing itself here.
 PairBuckCoulCutIntel::~PairBuckCoulCutIntel()
 {
 }
 
 // Entry point for the force computation: forward to the templated
 // compute() instantiated for the precision mode reported by
 // fix->precision().
 //   PREC_MODE_MIXED  -> <float,double>,  mixed buffers,  force_const_single
 //   PREC_MODE_DOUBLE -> <double,double>, double buffers, force_const_double
 //   anything else    -> <float,float>,   single buffers, force_const_single
 // Afterwards fix->balance_stamp() is called and vflag_fdotr is cleared.
 void PairBuckCoulCutIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver.
 //   flt_t / acc_t  the two element types of the IntelBuffers in use
 //   eflag, vflag   energy / virial request flags, passed to ev_setup()
 //   buffers        buffers for this precision
 //   fc             force constants for flt_t
 //
 // Steps: set up energy/virial tallying, pack atom data when the fix does
 // not use separate buffers and ago != 0, then call eval<EFLAG,NEWTON_PAIR>
 // twice -- first with offload = 1 over [0, offload_end), then with
 // offload = 0 over [host_start, inum).
 template <class flt_t, class acc_t>
 void PairBuckCoulCutIntel::compute(int eflag, int vflag,
                                    IntelBuffers<flt_t,acc_t> *buffers,
                                    const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
   } else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
     // only go parallel when more than INTEL_HTHREADS threads are available
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       // each thread packs its share of the nlocal + nghost atoms
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // virial mode handed to eval: 2 when vflag_fdotr is set, 1 when only
   // vflag is set, 0 otherwise
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
   // template arguments are <EFLAG, NEWTON_PAIR>; the first call argument
   // is the offload flag
   if (eflag) {
     if (force->newton_pair) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
                                 IntelBuffers<flt_t,acc_t> *buffers,
                                 const ForceConst<flt_t> &fc,
                                 const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
 
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
   const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
   const C_CUT_T * _noalias const c_cut = fc.c_cut[0];
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
   const int ncoulmask = this->ncoulmask;
   const int ncoulshiftbits = this->ncoulshiftbits;
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload)                 \
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy, c_cut:length(0) alloc_if(0) free_if(0))      \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       for (int i = iifrom; i < iito; i += iip) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off;
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                                sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
           forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
 
           const int sbindex = jlist[jj] >> SBBITS & 3;
           const int j = jlist[jj] & NEIGHMASK;
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           const flt_t r = sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cut_coulsq) {
           #endif
             forcecoul = qqrd2e * qtmp*q[j]/r;
             if (EFLAG)
               ecoul = forcecoul;
             if (sbindex){
               const flt_t factor_coul = special_coul[sbindex];
               forcecoul *= factor_coul;
               if(EFLAG)
                 ecoul *= factor_coul;
 
             }
           #ifdef INTEL_VMASK
           }
           #else
           if (rsq >= c_cuti[jtype].cut_coulsq)
             { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
           #endif
 
           #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
             forcebuck = r * rexp * c_forcei[jtype].buck1 -
               r6inv * c_forcei[jtype].buck2;
             if (EFLAG)
               evdwl = rexp * c_energyi[jtype].a -
                 r6inv * c_energyi[jtype].c -
                 c_energyi[jtype].offset;
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
               if (EFLAG)
                 evdwl *= factor_lj;
             }
           #ifdef INTEL_VMASK
           }
           #else
           if (rsq >= c_cuti[jtype].cut_ljsq)
             { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
           #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cutsq) {
           #endif
             const flt_t fpair = (forcecoul + forcebuck) * r2inv;
             const flt_t fpx = fpair * delx;
             fxtmp += fpx;
             if (NEWTON_PAIR) f[j].x -= fpx;
             const flt_t fpy = fpair * dely;
             fytmp += fpy;
             if (NEWTON_PAIR) f[j].y -= fpy;
             const flt_t fpz = fpair * delz;
             fztmp += fpz;
             if (NEWTON_PAIR) f[j].z -= fpz;
 
 
             if (EFLAG) {
               sevdwl += evdwl;
               secoul += ecoul;
               if (eatom) {
                 fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
                 if (NEWTON_PAIR)
                   f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
               }
             }
             if (NEWTON_PAIR == 0)
               IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
         IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = oecoul;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckCoulCutIntel::init_style()
 {
   PairBuckCoulCut::init_style();
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 
 }
 
 /* ----------------------------------------------------------------------
    Copy the pair coefficients held by the base class into the
    precision-specific containers used by the compute kernel.
 
    fc      : coefficient container; resized via set_ntypes() and then filled
    buffers : buffer object whose cutneighsq table is refreshed here
 
    In offload builds the packed tables are additionally transferred to the
    coprocessor at the end of the routine.
 ------------------------------------------------------------------------- */
 template <class flt_t, class acc_t>
 void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   // tables are indexed directly by atom type, hence ntypes + 1 entries
   int tp1 = atom->ntypes + 1;
   // ntable = 2^ncoultablebits (stays 1 when ncoultablebits is zero)
   int ntable = 1;
   if (ncoultablebits)
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       // only pairs that were set explicitly, or whose two self-interactions
       // were both set, are initialised; results are stored symmetrically
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   // copy the four special-bond scaling factors; entry 0 is then forced to
   // 1.0 (the two overrides are simply repeated on every iteration)
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_coul[i] = force->special_coul[i];
     fc.special_coul[0] = 1.0;
     fc.special_lj[0] = 1.0;
   }
 
   // gather the per-type-pair cutoffs, force and energy coefficients from the
   // base-class arrays into the packed structs
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       fc.c_cut[i][j].cutsq = cutsq[i][j];
       fc.c_cut[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_cut[i][j].cut_coulsq = cut_coulsq[i][j];
       fc.c_force[i][j].buck1 = buck1[i][j];
       fc.c_force[i][j].buck2 = buck2[i][j];
       fc.c_force[i][j].rhoinv = rhoinv[i][j];
       fc.c_energy[i][j].a = a[i][j];
       fc.c_energy[i][j].c = c[i][j];
       fc.c_energy[i][j].offset = offset[i][j];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   // a negative _cop skips the transfer (presumably: no coprocessor in use)
   if (_cop < 0) return;
   // plain local pointers to the first element of each table are what the
   // transfer pragma below names
   flt_t * special_lj = fc.special_lj;
   flt_t * special_coul = fc.special_coul;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   C_CUT_T * c_cut = fc.c_cut[0];
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   // alloc_if(0)/free_if(0): copy into device storage that already exists
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0))   \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    (Re)allocate the per-type-pair coefficient tables.
 
    ntypes : number of rows/columns of each table
    ntable : table size; in this routine it is only compared and recorded
    memory : allocator used to create the new tables
    cop    : coprocessor number used for the device-side allocation
 
    Nothing is reallocated when ntypes and ntable both match the values
    recorded by the previous call.
 ------------------------------------------------------------------------- */
 template <class flt_t>
 void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                            const int ntable,
                                                            Memory *memory,
                                                            const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     // release the tables created by an earlier call, device copies first
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       c_cut_t * oc_cut = c_cut[0];
 
       // NOTE(review): the guard tests the stored _cop while the pragma
       // targets the incoming cop; these look intended to be the same
       // device -- confirm, and consider target(mic:_cop) for the release.
       if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
           oc_energy != NULL && ospecial_coul != NULL &&
           _cop >= 0) {
         // nocopy + free_if(1): drop the device allocations without a copy
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1))        \
           nocopy(oc_cut: alloc_if(0) free_if(1))
       }
       #endif
 
       // host tables are destroyed through the allocator pointer recorded
       // by the previous call (_memory), not the one passed in now
       _memory->destroy(c_force);
       _memory->destroy(c_energy);
       _memory->destroy(c_cut);
 
     }
     if (ntypes > 0) {
       _cop = cop;
       // square ntypes x ntypes host tables
       memory->create(c_force,ntypes,ntypes,"fc.c_force");
       memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
       memory->create(c_cut,ntypes,ntypes,"fc.c_cut");
 
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       c_cut_t * oc_cut = c_cut[0];
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
           oc_energy != NULL && ospecial_coul != NULL &&
           cop >= 0) {
         // nocopy + alloc_if(1): reserve device storage only; no data is
         // transferred by this pragma
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
           nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_cut: length(tp1sq) alloc_if(1) free_if(0))
 
       }
       #endif
     }
   }
   // recorded on every call, including calls that reallocated nothing
   _ntypes=ntypes;
   _ntable=ntable;
   _memory=memory;
 }
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
index 995e2e858..2b9fea7a5 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
@@ -1,689 +1,689 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Rodrigo Canales (RWTH Aachen University)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pair_buck_coul_long_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "kspace.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
 #include "suffix.h"
 #include "force.h"
 #include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 // Shorthand for the nested ForceConst<flt_t> types. Each macro expands to a
 // dependent type name, so it can only be used where a template parameter
 // named flt_t is in scope.
 #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
 #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
 #define TABLE_T typename ForceConst<flt_t>::table_t
 
 // Build on top of PairBuckCoulLong and add the INTEL bit to the suffix flags.
 PairBuckCoulLongIntel::PairBuckCoulLongIntel(LAMMPS *lmp)
   : PairBuckCoulLong(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 // Nothing is released here; the body is intentionally empty.
 PairBuckCoulLongIntel::~PairBuckCoulLongIntel()
 {
 }
 
 // Entry point called with the energy/virial flags: forwards to the templated
 // compute() instantiated for the precision mode reported by the intel fix.
 void PairBuckCoulLongIntel::compute(int eflag, int vflag)
 {
   if (fix->precision() != FixIntel::PREC_MODE_MIXED) {
     if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
       // double precision for both arithmetic and accumulation
       compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                              force_const_double);
     } else {
       // any other mode: single precision throughout
       compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                            force_const_single);
     }
   } else {
     // mixed mode: float arithmetic, double accumulators
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   }
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 /* ----------------------------------------------------------------------
    Precision-specific driver: sets up the energy/virial bookkeeping, packs
    the atom data into the buffers when required, and runs eval() twice --
    first with offload=1 on [0, offload_end), then with offload=0 on
    [host_start, inum) -- using the instantiation that matches eflag and
    force->newton_pair.
 ------------------------------------------------------------------------- */
 template <class flt_t, class acc_t>
 void PairBuckCoulLongIntel::compute(int eflag, int vflag,
                                     IntelBuffers<flt_t,acc_t> *buffers,
                                     const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   // packing happens here only when _lrt is off, ago is nonzero and the fix
   // does not use separate buffers
   if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
     // thread the packing only when more than INTEL_HTHREADS threads exist
     const int packthreads = (nthreads > INTEL_HTHREADS) ? nthreads : 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
 
     fix->stop_watch(TIME_PACK);
   }
 
   // virial mode handed to eval(): 2 when vflag_fdotr is set, otherwise 1
   // when vflag is set, otherwise 0
   const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
 
   if (force->newton_pair) {
     if (eflag) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (eflag) {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
                                  IntelBuffers<flt_t,acc_t> *buffers,
                                  const ForceConst<flt_t> &fc,
                                  const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
 
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
   const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
   const flt_t * _noalias const rho_inv = fc.rho_inv[0];
   const TABLE_T * _noalias const table = fc.table;
   const flt_t * _noalias const etable = fc.etable;
   const flt_t * _noalias const detable = fc.detable;
   const flt_t * _noalias const ctable = fc.ctable;
   const flt_t * _noalias const dctable = fc.dctable;
   const flt_t g_ewald = fc.g_ewald;
   const flt_t tabinnersq = fc.tabinnersq;
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   flt_t * _noalias const ccachex = buffers->get_ccachex();
   flt_t * _noalias const ccachey = buffers->get_ccachey();
   flt_t * _noalias const ccachez = buffers->get_ccachez();
   flt_t * _noalias const ccachew = buffers->get_ccachew();
   int * _noalias const ccachei = buffers->get_ccachei();
   int * _noalias const ccachej = buffers->get_ccachej();
   const int ccache_stride = _ccache_stride;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
   const int ncoultablebits = this->ncoultablebits;
   const int ncoulmask = this->ncoulmask;
   const int ncoulshiftbits = this->ncoulshiftbits;
   #ifdef INTEL_ALLOW_TABLE
   #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
                     in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
                     in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
   #else
   #define ITABLE_IN
   #endif
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload)                 \
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
     in(rho_inv:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
     in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)    \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     ITABLE_IN signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       const int toffs = tid * ccache_stride;
       flt_t * _noalias const tdelx = ccachex + toffs;
       flt_t * _noalias const tdely = ccachey + toffs;
       flt_t * _noalias const tdelz = ccachez + toffs;
       flt_t * _noalias const trsq = ccachew + toffs;
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
       for (int i = iifrom; i < iito; i += iip) {
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
 
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
           if (rsq < c_forcei[jtype].cutsq) {
             trsq[ej]=rsq;
             tdelx[ej]=delx;
             tdely[ej]=dely;
             tdelz[ej]=delz;
             tjtype[ej]=jtype;
             tj[ej]=jlist[jj];
             ej++;
           }
         }
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                                  sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
           forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
 
           const int j = tj[jj] & NEIGHMASK;
           const int sbindex = tj[jj] >> SBBITS & 3;
           const int jtype = tjtype[jj];
           const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
           const flt_t r = (flt_t)1.0 / sqrt(r2inv);
 
           #ifdef INTEL_ALLOW_TABLE
           if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
             const flt_t A1 =  0.254829592;
             const flt_t A2 = -0.284496736;
             const flt_t A3 =  1.421413741;
             const flt_t A4 = -1.453152027;
             const flt_t A5 =  1.061405429;
             const flt_t EWALD_F = 1.12837917;
             const flt_t INV_EWALD_P = 1.0 / 0.3275911;
 
             const flt_t grij = g_ewald * r;
             const flt_t expm2 = exp(-grij * grij);
             const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
             const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
             const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
             forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
             if (EFLAG) ecoul = prefactor * erfc;
 
             const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
               prefactor;
             forcecoul -= adjust;
             if (EFLAG) ecoul -= adjust;
 
           #ifdef INTEL_ALLOW_TABLE
           } else {
             float rsq_lookup = rsq;
             const int itable = (__intel_castf32_u32(rsq_lookup) &
               ncoulmask) >> ncoulshiftbits;
             const flt_t fraction = (rsq_lookup - table[itable].r) *
               table[itable].dr;
 
             const flt_t tablet = table[itable].f +
               fraction * table[itable].df;
             forcecoul = qtmp * q[j] * tablet;
             if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
               fraction * detable[itable]);
             if (sbindex) {
               const flt_t table2 = ctable[itable] +
                 fraction * dctable[itable];
               const flt_t prefactor = qtmp * q[j] * table2;
               const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
                 prefactor;
               forcecoul -= adjust;
               if (EFLAG) ecoul -= adjust;
             }
           }
           #endif
 
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             flt_t rexp = exp(-r * rho_invi[jtype]);
             forcebuck = r * rexp * c_forcei[jtype].buck1 -
               r6inv * c_forcei[jtype].buck2;
             if (EFLAG) evdwl = rexp * c_energyi[jtype].a -
                          r6inv * c_energyi[jtype].c -
                          c_energyi[jtype].offset;
 
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
               if (EFLAG) evdwl *= factor_lj;
             }
           #ifdef INTEL_VMASK
           }
           #else
           if (rsq > c_forcei[jtype].cut_ljsq)
             { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
           const flt_t fpair = (forcecoul + forcebuck) * r2inv;
           const flt_t fpx = fpair * tdelx[jj];
           fxtmp += fpx;
           if (NEWTON_PAIR) f[j].x -= fpx;
           const flt_t fpy = fpair * tdely[jj];
           fytmp += fpy;
           if (NEWTON_PAIR) f[j].y -= fpy;
           const flt_t fpz = fpair * tdelz[jj];
           fztmp += fpz;
           if (NEWTON_PAIR) f[j].z -= fpz;
 
           if (EFLAG) {
             sevdwl += evdwl;
             secoul += ecoul;
             if (eatom) {
               fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
               if (NEWTON_PAIR)
                 f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
             }
           }
           if (NEWTON_PAIR == 0)
             IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
                                   fpx, fpy, fpz);
         } // for jj
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
         IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = oecoul;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairBuckCoulLongIntel::init_style()
 {
   PairBuckCoulLong::init_style();
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 
   _lrt = fix->lrt();
 }
 
 // Fill the ForceConst object `fc` (and the cutneighsq table held by
 // `buffers`) from the style's current coefficients: cutoffs, special
 // factors, per-type-pair Buckingham coefficients and, when ncoultablebits is
 // nonzero, the Coulomb lookup tables. In offload builds with _cop >= 0 the
 // packed arrays are then sent to the coprocessor.
 template <class flt_t, class acc_t>
 void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop >= 0) off_ccache = 1;
   #endif
   buffers->grow_ccache(off_ccache, comm->nthreads, 1);
   _ccache_stride = buffers->ccache_stride();
 
   int tp1 = atom->ntypes + 1;
   // ntable = 2^ncoultablebits (stays 1 when ncoultablebits is 0).
   int ntable = 1;
   if (ncoultablebits)
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   fc.g_ewald = force->kspace->g_ewald;
   fc.tabinnersq = tabinnersq;
 
   // Copy the special factors; entry 0 of both arrays is forced to 1.0
   // (re-assigned on every pass, so it ends at 1.0 regardless of the value
   // held in force->special_*[0]).
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_coul[i] = force->special_coul[i];
     fc.special_coul[0] = 1.0;
     fc.special_lj[0] = 1.0;
   }
 
   // Copy per-type-pair coefficients into fc; abort when cutsq[i][j] is
   // smaller than cut_ljsq[i][j].
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       if (cutsq[i][j] < cut_ljsq[i][j])
         error->all(FLERR,
          "Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_force[i][j].buck1 = buck1[i][j];
       fc.c_force[i][j].buck2 = buck2[i][j];
       fc.rho_inv[i][j] = rhoinv[i][j];
       fc.c_energy[i][j].a = a[i][j];
       fc.c_energy[i][j].c = c[i][j];
       fc.c_energy[i][j].offset = offset[i][j];
       // The `pad` member also receives rhoinv.
       fc.c_energy[i][j].pad = rhoinv[i][j];
     }
   }
 
   // Coulomb tables are copied only when tabulation is enabled.
   if (ncoultablebits) {
     for (int i = 0; i < ntable; i++) {
       fc.table[i].r = rtable[i];
       fc.table[i].dr = drtable[i];
       fc.table[i].f = ftable[i];
       fc.table[i].df = dftable[i];
       fc.etable[i] = etable[i];
       fc.detable[i] = detable[i];
       fc.ctable[i] = ctable[i];
       fc.dctable[i] = dctable[i];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   // Nothing to transfer when no coprocessor is selected.
   if (_cop < 0) return;
   // Local pointer copies are used as the variables named in the pragma.
   flt_t * special_lj = fc.special_lj;
   flt_t * special_coul = fc.special_coul;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   TABLE_T * table = fc.table;
   flt_t * rho_inv = fc.rho_inv[0];
   flt_t * etable = fc.etable;
   flt_t * detable = fc.detable;
   flt_t * ctable = fc.ctable;
   flt_t * dctable = fc.dctable;
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
     in(rho_inv: length(tp1sq) alloc_if(0) free_if(0)) \
     in(table: length(ntable) alloc_if(0) free_if(0)) \
     in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)allocate the coefficient and table arrays when `ntypes` or `ntable`
 // differs from the cached _ntypes/_ntable. Existing arrays are destroyed
 // first; in offload builds the matching coprocessor allocations are freed
 // and re-created as well. The cached sizes and Memory pointer are updated
 // on every call, whether or not anything was reallocated.
 template <class flt_t>
 void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                            const int ntable,
                                                            Memory *memory,
                                                            const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     // Release whatever a previous call allocated.
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       table_t * otable = table;
       flt_t * orho_inv = rho_inv[0];
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       // NOTE(review): the guard tests the stored _cop while the pragma
       // targets the incoming `cop` - confirm the two always match here.
       if (ospecial_lj != NULL && oc_force != NULL && orho_inv != NULL &&
           oc_energy != NULL && otable != NULL && oetable != NULL &&
           odetable != NULL && octable != NULL && odctable != NULL &&
           ospecial_coul != NULL && _cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
           nocopy(orho_inv: alloc_if(0) free_if(1)) \
           nocopy(otable: alloc_if(0) free_if(1)) \
           nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
       }
       #endif
 
       // Host arrays are released through the Memory object saved earlier.
       _memory->destroy(c_force);
       _memory->destroy(c_energy);
       _memory->destroy(table);
       _memory->destroy(rho_inv);
       _memory->destroy(etable);
       _memory->destroy(detable);
       _memory->destroy(ctable);
       _memory->destroy(dctable);
     }
     if (ntypes > 0) {
       _cop = cop;
       // Per-type-pair arrays are ntypes x ntypes; tables hold ntable entries.
       memory->create(c_force,ntypes,ntypes,"fc.c_force");
       memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
       memory->create(rho_inv,ntypes,ntypes,"fc.rho_inv");
       memory->create(table,ntable,"pair:fc.table");
       memory->create(etable,ntable,"pair:fc.etable");
       memory->create(detable,ntable,"pair:fc.detable");
       memory->create(ctable,ntable,"pair:fc.ctable");
       memory->create(dctable,ntable,"pair:fc.dctable");
 
       #ifdef _LMP_INTEL_OFFLOAD
       // special_lj / special_coul are not created above; only their
       // coprocessor counterparts (length 4) are allocated here.
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       table_t * otable = table;
       flt_t * orho_inv = rho_inv[0];
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL && orho_inv != NULL &&
           oc_energy != NULL && otable !=NULL && oetable != NULL &&
           odetable != NULL && octable != NULL && odctable != NULL &&
           ospecial_coul != NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
           nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(orho_inv: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _ntable=ntable;
   _memory=memory;
 }
 
 
diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp
index 8c63d2e62..05a28eb7f 100644
--- a/src/USER-INTEL/pair_buck_intel.cpp
+++ b/src/USER-INTEL/pair_buck_intel.cpp
@@ -1,495 +1,495 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
    certain rights in this software.  This software is distributed under
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: Rodrigo Canales (RWTH Aachen University)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include "pair_buck_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "math_const.h"
 #include "memory.h"
 #include "suffix.h"
 #include "force.h"
 #include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
 #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
 #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
 
 // Construct on top of PairBuck and set the INTEL bit in suffix_flag.
 PairBuckIntel::PairBuckIntel(LAMMPS *lmp) : PairBuck(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
 
 // Empty destructor: this body releases nothing itself.
 PairBuckIntel::~PairBuckIntel()
 {
 }
 
 // Entry point for force computation: dispatch to the templated compute()
 // matching the precision mode reported by the fix, then call
 // fix->balance_stamp() and clear vflag_fdotr.
 void PairBuckIntel::compute(int eflag, int vflag)
 {
   // Mixed precision instantiates <float,double> but shares
   // force_const_single with the pure single-precision path.
   if (fix->precision()==FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver: set up energy/virial flags, pack atom data
 // into `buffers` when required, then call eval<EFLAG,NEWTON_PAIR> twice -
 // first with offload = 1 over [0, offload_end), then with offload = 0 over
 // [host_start, inum).
 template <class flt_t, class acc_t>
 void PairBuckIntel::compute(int eflag, int vflag,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
   } else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   // Packing happens here only when ago is nonzero and the fix reports that
   // separate buffers are not in use.
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
     // Pack with all threads only above the INTEL_HTHREADS threshold.
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // ovflag: 2 when vflag_fdotr is set, else 1 when vflag is set, else 0.
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
   // Map the runtime eflag / newton_pair values onto template arguments.
   if (eflag) {
     if (force->newton_pair) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Buckingham force kernel for the neighbor-list entries [astart, aend).
 //   EFLAG       - when nonzero, pair energy is accumulated
 //   NEWTON_PAIR - when nonzero, the force on neighbor j is updated too and
 //                 each thread works in its own f_stride-sized force slice
 //   offload     - selects which buffers are fetched and, in offload builds,
 //                 whether the offload pragma is active
 //   vflag       - virial mode forwarded to the tally/reduction macros
 // Results are handed back through fix->add_result_array().
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckIntel::eval(const int offload, const int vflag,
                          IntelBuffers<flt_t,acc_t> *buffers,
                          const ForceConst<flt_t> &fc,
                          const int astart, const int aend)
 {
   // Empty range: nothing to compute.
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_lj = fc.special_lj;
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
   const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
 
   // Note: this local `ntypes` is atom->ntypes + 1 and is used as the row
   // stride into the flattened coefficient tables below.
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload)                 \
     in(special_lj:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0))      \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, 0);
 
     // Global energy/virial accumulators, combined across threads by the
     // OpenMP reduction clause below.
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl =  (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       // With NEWTON_PAIR each thread writes into its own slice of f_start
       // (offset tid * f_stride), zeroed before use; otherwise all threads
       // share one array.
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       for (int i = iifrom; i < iito; i += iip) {
         // The atom type is read from the w component of the packed position.
         const int itype = x[i].w;
 
         // Row of the flattened coefficient tables belonging to itype.
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl,  sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl =  (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                                sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
 
           flt_t  forcebuck, evdwl;
           forcebuck = evdwl =  (flt_t)0.0;
 
           // A neighbor entry carries a 2-bit special index above SBBITS;
           // NEIGHMASK extracts the atom index.
           const int sbindex = jlist[jj] >> SBBITS & 3;
           const int j = jlist[jj] & NEIGHMASK;
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           const flt_t r = sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           // With INTEL_VMASK defined the cutoff is a real branch; without
           // it, force and energy are always computed and then zeroed for
           // pairs beyond the cutoff.
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cutsq) {
           #endif
             const flt_t r6inv = r2inv * r2inv * r2inv;
             const flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
             forcebuck = r * rexp * c_forcei[jtype].buck1 -
               r6inv * c_forcei[jtype].buck2;
 
             #ifndef INTEL_VMASK
             if (rsq > c_forcei[jtype].cutsq)
               forcebuck =(flt_t)0.0;
             #endif
             if (EFLAG) {
               evdwl = rexp * c_energyi[jtype].a -
                 r6inv * c_energyi[jtype].c -
                 c_energyi[jtype].offset;
 
               #ifndef INTEL_VMASK
               if (rsq > c_forcei[jtype].cutsq)
                 evdwl =(flt_t)0.0;
               #endif
             }
 
             // Scale by the special factor only when the index is nonzero.
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
               if (EFLAG)
                 evdwl *= factor_lj;
             }
             const flt_t fpair =  forcebuck * r2inv;
             const flt_t fpx = fpair * delx;
             fxtmp += fpx;
             if (NEWTON_PAIR) f[j].x -= fpx;
             const flt_t fpy = fpair * dely;
             fytmp += fpy;
             if (NEWTON_PAIR) f[j].y -= fpy;
             const flt_t fpz = fpair * delz;
             fztmp += fpz;
             if (NEWTON_PAIR) f[j].z -= fpz;
 
             if (EFLAG) {
               sevdwl += evdwl;
               // Per-atom energy: half goes to i (via fwtmp) and, with
               // NEWTON_PAIR, half to the w component of f[j].
               if (eatom) {
                 fwtmp += (flt_t)0.5 * evdwl;
                 if (NEWTON_PAIR)
                   f[j].w += (flt_t)0.5 * evdwl;
               }
             }
             if (NEWTON_PAIR == 0)
               IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
         // With NEWTON_PAIR f[i] may already hold contributions written via
         // f[j] updates, so accumulate; otherwise overwrite.
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
         IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     // Store the totals: slot 0 gets the pair energy, slot 1 is zeroed, and
     // slots 2-7 get the six virial sums. Without NEWTON_PAIR the totals are
     // halved first - presumably because the full list visits every pair
     // from both atoms; confirm against the neighbor-list setup.
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   // Hand the force buffer (and ev_global when energy or virial was
   // requested) back to the fix.
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 // Prepare this style for a run: run the base-class init_style(), adjust the
 // last neighbor request, look up the "package_intel" fix, and pack the force
 // constants for whichever precision mode that fix reports.
 void PairBuckIntel::init_style()
 {
   PairBuck::init_style();
   // With newton_pair off, the last neighbor request is switched from half
   // to full (half = 0, full = 1).
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   // A negative index from find_fix() is treated as "fix not defined".
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   // _cop is only refreshed from the fix in offload builds.
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   // Mixed and single precision both use force_const_single; they differ
   // only in which buffer object is passed along.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 // Fill the ForceConst object `fc` (and the cutneighsq table held by
 // `buffers`) from the style's current coefficients: cutoffs, special LJ
 // factors and per-type-pair Buckingham coefficients. In offload builds with
 // _cop >= 0 the packed arrays are then sent to the coprocessor.
 template <class flt_t, class acc_t>
 void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   int tp1 = atom->ntypes + 1;
 
   fc.set_ntypes(tp1, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   // Copy the special LJ factors; entry 0 is forced to 1.0 (re-assigned on
   // every pass, so it ends at 1.0 regardless of force->special_lj[0]).
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_lj[0] = 1.0;
   }
 
   // Copy per-type-pair force and energy coefficients into fc.
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       fc.c_force[i][j].buck1 = buck1[i][j];
       fc.c_force[i][j].buck2 = buck2[i][j];
       fc.c_force[i][j].rhoinv = rhoinv[i][j];
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_energy[i][j].a = a[i][j];
       fc.c_energy[i][j].c = c[i][j];
       fc.c_energy[i][j].offset = offset[i][j];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   // Nothing to transfer when no coprocessor is selected.
   if (_cop < 0) return;
   // Local pointer copies are used as the variables named in the pragma.
   flt_t * special_lj = fc.special_lj;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0))   \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)allocate the per-type-pair coefficient arrays when `ntypes` differs
 // from the cached _ntypes. Existing arrays are destroyed first; in offload
 // builds the matching coprocessor allocations are freed and re-created as
 // well. The cached size and Memory pointer are updated on every call.
 template <class flt_t>
 void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                   Memory *memory,
                                                   const int cop) {
   if ( (ntypes != _ntypes ) ) {
     // Release whatever a previous call allocated.
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
 
       // NOTE(review): the guard tests the stored _cop while the pragma
       // targets the incoming `cop` - confirm the two always match here.
       if (ospecial_lj != NULL && oc_force != NULL &&
           oc_energy != NULL  &&
           _cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1))
       }
       #endif
 
       // Host arrays are released through the Memory object saved earlier.
       _memory->destroy(c_force);
       _memory->destroy(c_energy);
 
     }
     if (ntypes > 0) {
       _cop = cop;
       // Both arrays are ntypes x ntypes.
       memory->create(c_force,ntypes,ntypes,"fc.c_force");
       memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
 
 
       #ifdef _LMP_INTEL_OFFLOAD
       // special_lj is not created above; only its coprocessor counterpart
       // (length 4) is allocated here.
       flt_t * ospecial_lj = special_lj;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL &&
           oc_energy != NULL &&
           cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0))
 
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _memory=memory;
 }
 
 
diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp
index c7cddfccc..09f27504a 100644
--- a/src/USER-INTEL/pair_dpd_intel.cpp
+++ b/src/USER-INTEL/pair_dpd_intel.cpp
@@ -1,617 +1,617 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
                         Shun Xu (Computer Network Information Center, CAS)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include "pair_dpd_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 #define LMP_MKL_RNG VSL_BRNG_MT19937
 #define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
 #define IEPSILON 1.0e10
 
 /* ---------------------------------------------------------------------- */
 
 // Construct on top of PairDPD: set the INTEL bit in suffix_flag, clear
 // respa_enable, and start with no per-thread random number generators.
 PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
   PairDPD(lmp)
 {
   suffix_flag |= Suffix::INTEL;
   respa_enable = 0;
   random_thread = NULL;
   _nrandom_thread = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Release the per-thread random number generators (OpenMP builds only) and
 // then the array that holds them.
 PairDPDIntel::~PairDPDIntel()
 {
   #if defined(_OPENMP)
   if (_nrandom_thread) {
     #ifdef LMP_USE_MKL_RNG
     // MKL build: every stream, including index 0, is deleted.
     for (int i = 0; i < _nrandom_thread; i++)
       vslDeleteStream(&random_thread[i]);
     #else
     // NOTE(review): the loop starts at 1, so entry 0 is not deleted here -
     // presumably it is owned elsewhere; confirm against where
     // random_thread is filled.
     for (int i = 1; i < _nrandom_thread; i++)
       delete random_thread[i];
     #endif
   }
   #endif
   delete []random_thread;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Entry point for force computation: dispatch to the templated compute()
 // matching the precision mode reported by the fix, then call
 // fix->balance_stamp() and clear vflag_fdotr.
 void PairDPDIntel::compute(int eflag, int vflag)
 {
   // Mixed precision instantiates <float,double> but shares
   // force_const_single with the pure single-precision path.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver: set up energy/virial flags, pack atom data
 // into `buffers` when required, then call eval<ONETYPE,EFLAG,NEWTON_PAIR>
 // twice - first with offload = 1 over [0, offload_end), then with
 // offload = 0 over [host_start, inum).
 template <class flt_t, class acc_t>
 void PairDPDIntel::compute(int eflag, int vflag,
                            IntelBuffers<flt_t,acc_t> *buffers,
                            const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag, vflag);
   } else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   // Packing happens here only when ago is nonzero and the fix reports that
   // separate buffers are not in use.
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
     // Pack with all threads only above the INTEL_HTHREADS threshold.
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // ovflag: 2 when vflag_fdotr is set, else 1 when vflag is set, else 0.
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
   // Map the runtime _onetype / eflag / newton_pair values onto the three
   // integer template arguments, in that order.
   if (_onetype) {
     if (eflag) {
       if (force->newton_pair) {
         eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
         eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   } else {
     if (eflag) {
       if (force->newton_pair) {
         eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
         eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   }
 }
 
 template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairDPDIntel::eval(const int offload, const int vflag,
                         IntelBuffers<flt_t,acc_t> *buffers,
                         const ForceConst<flt_t> &fc,
                         const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   typedef struct { double x, y, z; } lmp_vt;
   lmp_vt *v = (lmp_vt *)atom->v[0];
   const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
   const FC_PACKED1_T * _noalias const param = fc.param[0];
   const flt_t * _noalias const special_lj = fc.special_lj;
   int * _noalias const rngi_thread = fc.rngi;
   const int rng_size = buffers->get_max_nbors();
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
   int *overflow = fix->get_off_overflow_flag();
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       #ifdef LMP_USE_MKL_RNG
       VSLStreamStatePtr *my_random = &(random_thread[tid]);
       #else
       RanMars *my_random = random_thread[tid];
       #endif
       flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
       int rngi = rngi_thread[tid];
 
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       flt_t icut, a0, gamma, sigma;
       if (ONETYPE) {
         icut = param[3].icut;
         a0 = param[3].a0;
         gamma = param[3].gamma;
         sigma = param[3].sigma;
       }
       for (int i = iifrom; i < iito; i += iip) {
         int itype, ptr_off;
         const FC_PACKED1_T * _noalias parami;
         if (!ONETYPE) {
           itype = x[i].w;
           ptr_off = itype * ntypes;
           parami = param + ptr_off;
         }
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
 	const flt_t vxtmp = v[i].x;
 	const flt_t vytmp = v[i].y;
 	const flt_t vztmp = v[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
 	if (rngi + jnum > rng_size) {
           #ifdef LMP_USE_MKL_RNG
 	  if (sizeof(flt_t) == sizeof(float))
 	    vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
 			  (float*)my_rand_buffer, (float)0.0, (float)1.0 );
 	  else
 	    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
 	  		  (double*)my_rand_buffer, 0.0, 1.0 );
           #else
           for (int jj = 0; jj < rngi; jj++)
             my_rand_buffer[jj] = my_random->gaussian();
           #endif
 	  rngi = 0;
 	}
 
         #if defined(LMP_SIMD_COMPILER)
 	#pragma vector aligned
 	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
 	                         sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcelj, evdwl;
           forcelj = evdwl = (flt_t)0.0;
 
           int j, jtype, sbindex;
           if (!ONETYPE) {
             sbindex = jlist[jj] >> SBBITS & 3;
             j = jlist[jj] & NEIGHMASK;
           } else
             j = jlist[jj];
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           if (!ONETYPE) {
             jtype = x[j].w;
             icut = parami[jtype].icut;
           }
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 	  const flt_t rinv = (flt_t)1.0/sqrt(rsq);
 
           if (rinv > icut) {
             flt_t factor_dpd;
             if (!ONETYPE) factor_dpd = special_lj[sbindex];
 
 	    flt_t delvx = vxtmp - v[j].x;
 	    flt_t delvy = vytmp - v[j].y;
 	    flt_t delvz = vztmp - v[j].z;
 	    flt_t dot = delx*delvx + dely*delvy + delz*delvz;
 	    flt_t randnum = my_rand_buffer[jj];
 
 	    flt_t iwd = rinv - icut;
 	    if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
 
 	    if (!ONETYPE) {
 	      a0 = parami[jtype].a0;
 	      gamma = parami[jtype].gamma;
 	      sigma = parami[jtype].sigma;
 	    }
 	    flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
 	    if (!ONETYPE) fpair *= factor_dpd;
 	    fpair *= iwd;
 
             const flt_t fpx = fpair * delx;
             fxtmp += fpx;
             if (NEWTON_PAIR) f[j].x -= fpx;
             const flt_t fpy = fpair * dely;
             fytmp += fpy;
             if (NEWTON_PAIR) f[j].y -= fpy;
             const flt_t fpz = fpair * delz;
             fztmp += fpz;
             if (NEWTON_PAIR) f[j].z -= fpz;
 
             if (EFLAG) {
 	      flt_t cut = (flt_t)1.0/icut;
 	      flt_t r = (flt_t)1.0/rinv;
 	      evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
 	      if (!ONETYPE) evdwl *= factor_dpd;
               sevdwl += evdwl;
               if (eatom) {
                 fwtmp += (flt_t)0.5 * evdwl;
                 if (NEWTON_PAIR)
                   f[j].w += (flt_t)0.5 * evdwl;
               }
             }
 
             if (NEWTON_PAIR == 0)
               IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           } // if rsq
         } // for jj
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
 
         IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
 	rngi += jnum;
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
       rngi_thread[tid] = rngi;
     } // end omp
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ----------------------------------------------------------------------
    global settings
    ------------------------------------------------------------------------- */
 
 void PairDPDIntel::settings(int narg, char **arg) {
   // Drop the per-thread generators created by an earlier call. In the
   // RanMars variant slot 0 is assigned from `random` further down, so the
   // cleanup loop there starts at slot 1.
   #ifdef _OPENMP
   if (_nrandom_thread != 0) {
     #if defined(LMP_USE_MKL_RNG)
     for (int t = 0; t < _nrandom_thread; ++t)
       vslDeleteStream(&random_thread[t]);
     #else
     for (int t = 1; t < _nrandom_thread; ++t)
       delete random_thread[t];
     #endif
   }
   delete []random_thread;
   #endif
 
   // The base style handles the arguments; afterwards the generator table
   // is rebuilt with one entry per thread.
   PairDPD::settings(narg,arg);
   const int nthr = comm->nthreads;
   _nrandom_thread = nthr;
 
   #if defined(LMP_USE_MKL_RNG)
 
   random_thread = new VSLStreamStatePtr[nthr];
   #ifdef _OPENMP
   #pragma omp parallel
   {
     // each thread opens its own stream; the seed is offset by rank and
     // by thread id
     const int ithread = omp_get_thread_num();
     vslNewStream(&random_thread[ithread], LMP_MKL_RNG,
                  seed + comm->me + comm->nprocs * ithread );
   }
   #endif
 
   #else
 
   random_thread = new RanMars*[nthr];
   random_thread[0] = random;
   #ifdef _OPENMP
   #pragma omp parallel
   {
     // thread 0 reuses `random`; every other thread gets a fresh RanMars
     const int ithread = omp_get_thread_num();
     if (ithread != 0)
       random_thread[ithread] =
         new RanMars(lmp, seed+comm->me+comm->nprocs*ithread);
   }
   #endif
 
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairDPDIntel::init_style()
 {
   PairDPD::init_style();
 
   // Adjust the last entry of the neighbor request table, looked up only
   // after the base-class call above has run.
   const int ireq = neighbor->nrequest - 1;
   if (force->newton_pair == 0) {
     neighbor->requests[ireq]->half = 0;
     neighbor->requests[ireq]->full = 1;
   }
   neighbor->requests[ireq]->intel = 1;
 
   // The intel styles need the fix with ID "package_intel".
   const int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
   fix->pair_init_check();
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (fix->offload_balance() != 0.0)
     error->all(FLERR,
           "Offload for dpd/intel is not yet available. Set balance to 0.");
   #endif
 
   // Pack the coefficients into the table matching the precision mode;
   // anything that is neither mixed nor double uses the single buffers.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
     pack_force_const(force_const_single, fix->get_mixed_buffers());
     return;
   }
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
     pack_force_const(force_const_double, fix->get_double_buffers());
     return;
   }
   pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy the pair coefficients into the precision-specific table `fc` that
 // eval() reads, and refresh the cutoff arrays held by `buffers`.
 //   fc      - destination table (param, special_lj and per-thread RNG
 //             storage are sized through fc.set_ntypes)
 //   buffers - supplies max_nbors and the cutneighsq array to fill
 template <class flt_t, class acc_t>
 void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
                                     IntelBuffers<flt_t,acc_t> *buffers)
 {
   // _onetype is set only for a single atom type in a non-molecular system
   _onetype = 0;
   if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
 
   // tables are dimensioned ntypes+1 so they can be indexed by type 1..ntypes
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
         // the inverse cutoff is stored; eval() tests rinv > icut
         double icut = 1.0 / cut;
         fc.param[i][j].icut = fc.param[j][i].icut = icut;
       } else {
         // cutsq/cutneighsq are left untouched here; only icut is stored
         cut = init_one(i,j);
         double icut = 1.0 / cut;
         fc.param[i][j].icut = fc.param[j][i].icut = icut;
       }
     }
   }
 
   // copy the four special_lj factors; entry 0 is then forced to 1.0
   // (the assignment to index 0 is repeated on every pass of the loop)
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_lj[0] = 1.0;
   }
 
   // copy a0, gamma and sigma for every (i,j) type pair into the table
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       fc.param[i][j].a0 = a0[i][j];
       fc.param[i][j].gamma = gamma[i][j];
       fc.param[i][j].sigma = sigma[i][j];
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)allocate the storage owned by this ForceConst.
 //   ntypes    - dimension of the square `param` table
 //   nthreads  - number of rows in `rand_buffer_thread` and entries in `rngi`
 //   max_nbors - row length of `rand_buffer_thread`
 //   memory    - allocator used for the new arrays and remembered for the
 //               next release
 //   cop       - stored in _cop when arrays are allocated
 // Arrays are released and recreated only when `ntypes` differs from the
 // stored value.
 // NOTE(review): a change in nthreads or max_nbors alone does not resize
 // rand_buffer_thread/rngi -- confirm callers never vary them while ntypes
 // stays the same.
 template <class flt_t>
 void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                  const int nthreads,
                                                  const int max_nbors,
                                                  Memory *memory,
                                                  const int cop) {
   if (ntypes != _ntypes) {
     // release with the allocator recorded by the previous call
     if (_ntypes > 0) {
       _memory->destroy(param);
       _memory->destroy(rand_buffer_thread);
       _memory->destroy(rngi);
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(param,ntypes,ntypes,"fc.param");
       memory->create(rand_buffer_thread, nthreads, max_nbors,
                      "fc.rand_buffer_thread");
       // label fixed: this array was previously registered as "fc.param"
       memory->create(rngi,nthreads,"fc.rngi");
       // start every thread's cursor at max_nbors; eval() refills a buffer
       // when cursor + jnum exceeds get_max_nbors()
       for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
     }
   }
   _ntypes = ntypes;
   _memory = memory;
 }
 
 /* ----------------------------------------------------------------------
    proc 0 reads from restart file, bcasts
    ------------------------------------------------------------------------- */
 
 void PairDPDIntel::read_restart_settings(FILE *fp)
 {
   // Drop the per-thread generators created earlier. In the RanMars
   // variant slot 0 is assigned from `random` further down, so the cleanup
   // loop there starts at slot 1.
   #ifdef _OPENMP
   if (_nrandom_thread != 0) {
     #if defined(LMP_USE_MKL_RNG)
     for (int t = 0; t < _nrandom_thread; ++t)
       vslDeleteStream(&random_thread[t]);
     #else
     for (int t = 1; t < _nrandom_thread; ++t)
       delete random_thread[t];
     #endif
   }
   delete []random_thread;
   #endif
 
   // The base style reads the settings from the restart file; afterwards
   // the generator table is rebuilt with one entry per thread.
   PairDPD::read_restart_settings(fp);
   const int nthr = comm->nthreads;
   _nrandom_thread = nthr;
 
   #if defined(LMP_USE_MKL_RNG)
 
   random_thread = new VSLStreamStatePtr[nthr];
   #ifdef _OPENMP
   #pragma omp parallel
   {
     // each thread opens its own stream; the seed is offset by rank and
     // by thread id
     const int ithread = omp_get_thread_num();
     vslNewStream(&random_thread[ithread], LMP_MKL_RNG,
                  seed + comm->me + comm->nprocs * ithread );
   }
   #endif
 
   #else
 
   random_thread = new RanMars*[nthr];
   random_thread[0] = random;
   #ifdef _OPENMP
   #pragma omp parallel
   {
     // thread 0 reuses `random`; every other thread gets a fresh RanMars
     const int ithread = omp_get_thread_num();
     if (ithread != 0)
       random_thread[ithread] =
         new RanMars(lmp, seed+comm->me+comm->nprocs*ithread);
   }
   #endif
 
   #endif
 }
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index 3fbb58308..1f05ad0ef 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -1,1079 +1,1079 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include "pair_gayberne_intel.h"
 #include "math_extra_intel.h"
 
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
 #include <cmath>
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
 #include "atom.h"
 #include "comm.h"
 #include "atom_vec_ellipsoid.h"
 #include "force.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 // Shorthand for the packed parameter record types nested in
 // ForceConst<flt_t>. Each expansion names `flt_t` and starts with
 // `typename`, so a type called flt_t must be in scope where it is used.
 #define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
 #define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
 #define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
 
 /* ---------------------------------------------------------------------- */
 
 PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) : PairGayBerne(lmp)
 {
   // clear respa_enable (presumably switches off rRESPA support for this
   // style -- confirm against the Pair base class)
   respa_enable = 0;
   // add the INTEL bit to the suffix flags of this style
   suffix_flag |= Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairGayBerneIntel::compute(int eflag, int vflag)
 {
   // Forward to the templated kernel for the precision mode of the fix:
   // mixed -> <float,double>, double -> <double,double>, anything else ->
   // <float,float>.
   if (fix->precision() != FixIntel::PREC_MODE_MIXED) {
     if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
       compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                              force_const_double);
     else
       compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                            force_const_single);
   } else {
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   }
 
   fix->balance_stamp();
   // vflag_fdotr is always reset once the kernel has run
   vflag_fdotr = 0;
 }
 
 // Precision-templated driver: sets up energy/virial flags, packs
 // coordinates and quaternions when the fix reports non-separate buffers,
 // then runs eval() twice -- first with offload=1 over [0, offload_end) and
 // then with offload=0 over [host_start, inum).
 template <class flt_t, class acc_t>
 void PairGayBerneIntel::compute(int eflag, int vflag,
                                 IntelBuffers<flt_t,acc_t> *buffers,
                                 const ForceConst<flt_t> &fc)
 {
   if (!eflag && !vflag) evflag = vflag_fdotr = 0;
   else ev_setup(eflag, vflag);
 
   const int ago = neighbor->ago;
   const int nthreads = comm->nthreads;
   const int nall = atom->nlocal + atom->nghost;
   const int inum = list->inum;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
 
   if (fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
     QUAT_T * _noalias const quat = buffers->get_quat();
     const int * const ellipsoid = atom->ellipsoid;
     const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
 
     // pack in parallel only when more than INTEL_HTHREADS threads exist
     int packthreads = (nthreads > INTEL_HTHREADS) ? nthreads : 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
                                 sizeof(ATOM_T));
       if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
 
       // copy the quaternion of every atom that has a bonus entry
       for (int i = ifrom; i < ito; i++) {
         const int qi = ellipsoid[i];
         if (qi < 0) continue;
         quat[i].w = bonus[qi].quat[0];
         quat[i].i = bonus[qi].quat[1];
         quat[i].j = bonus[qi].quat[2];
         quat[i].k = bonus[qi].quat[3];
       }
     }
     // slot nall receives the quaternion (1,0,0,0)
     quat[nall].w = (flt_t)1.0;
     quat[nall].i = (flt_t)0.0;
     quat[nall].j = (flt_t)0.0;
     quat[nall].k = (flt_t)0.0;
     fix->stop_watch(TIME_PACK);
   }
 
   // virial mode handed to eval(): 2 if vflag_fdotr, else 1 if vflag, else 0
   const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
 
   // pick the <EFLAG,NEWTON_PAIR> instantiation
   if (force->newton_pair) {
     if (eflag) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (eflag) {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairGayBerneIntel::eval(const int offload, const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc,
                              const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   ATOM_T * _noalias const x = buffers->get_x(offload);
   QUAT_T * _noalias const quat = buffers->get_quat(offload);
   const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   const int *ellipsoid = atom->ellipsoid;
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (fix->separate_buffers()) {
     fix->start_watch(TIME_PACK);
     if (offload) {
       #pragma omp parallel
       {
         int ifrom, ito, tid;
         int nthreads = comm->nthreads;
         IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
                                   nthreads, sizeof(ATOM_T));
         if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
         for (int i = ifrom; i < ito; i++) {
           int qi = ellipsoid[i];
           if (qi > -1) {
             quat[i].w = bonus[qi].quat[0];
             quat[i].i = bonus[qi].quat[1];
             quat[i].j = bonus[qi].quat[2];
             quat[i].k = bonus[qi].quat[3];
           }
         }
         int nghost = nall - nlocal;
         if (nghost) {
           IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
                                  nthreads, sizeof(ATOM_T));
           int offset = 0;
           ifrom += nlocal;
           ito += nlocal;
           if (ago != 0) {
             offset = fix->offload_min_ghost() - nlocal;
             buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
           }
           for (int i = ifrom; i < ito; i++) {
             int qi = ellipsoid[i + offset];
             if (qi > -1) {
               quat[i].w = bonus[qi].quat[0];
               quat[i].i = bonus[qi].quat[1];
               quat[i].j = bonus[qi].quat[2];
               quat[i].k = bonus[qi].quat[3];
             }
           }
         }
       }
     } else {
       if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
       for (int i = fix->host_min_local(); i < nlocal; i++) {
         int qi = ellipsoid[i];
         if (qi > -1) {
           quat[i].w = bonus[qi].quat[0];
           quat[i].i = bonus[qi].quat[1];
           quat[i].j = bonus[qi].quat[2];
           quat[i].k = bonus[qi].quat[3];
         }
       }
       int offset = fix->host_min_ghost() - nlocal;
       if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
       for (int i = nlocal; i < nall; i++) {
         int qi = ellipsoid[i + offset];
         if (qi > -1) {
           quat[i].w = bonus[qi].quat[0];
           quat[i].i = bonus[qi].quat[1];
           quat[i].j = bonus[qi].quat[2];
           quat[i].k = bonus[qi].quat[3];
         }
       }
     }
     fix->stop_watch(TIME_PACK);
   }
   #endif
 
   //  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
   const flt_t * _noalias const special_lj = fc.special_lj;
 
   const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
   const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
   const FC_PACKED3_T * _noalias const ic = fc.ic;
   const flt_t mu = fc.mu;
   const flt_t gamma = fc.gamma;
   const flt_t upsilon = fc.upsilon;
 
   flt_t * const rsq_formi = fc.rsq_form[0];
   flt_t * const delx_formi = fc.delx_form[0];
   flt_t * const dely_formi = fc.dely_form[0];
   flt_t * const delz_formi = fc.delz_form[0];
   int * const jtype_formi = fc.jtype_form[0];
   int * const jlist_formi = fc.jlist_form[0];
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int max_nbors = _max_nbors;
   const int nthreads = tc;
 
   int pad = 1;
   if (offload) {
     if (INTEL_MIC_NBOR_PAD > 1)
       pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
   } else {
     if (INTEL_NBOR_PAD > 1)
       pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
   }
   const int pad_width = pad;
 
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(special_lj:length(0) alloc_if(0) free_if(0)) \
     in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
     in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
     in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(quat:length(nall+1) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
     in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute=MIC_Wtime();
     #endif
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (separate_flag) {
       if (separate_flag < 3) {
         int all_local = nlocal;
         int ghost_min = overflow[LMP_GHOST_MIN];
         nlocal = overflow[LMP_LOCAL_MAX] + 1;
         int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
         if (nghost < 0) nghost = 0;
         nall = nlocal + nghost;
         separate_flag--;
         int flength;
         if (NEWTON_PAIR) flength = nall;
         else flength = nlocal;
         IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
                              separate_flag);
         if (nghost) {
           if (nlocal < all_local || ghost_min > all_local) {
             memmove(x + nlocal, x + ghost_min,
                     (nall - nlocal) * sizeof(ATOM_T));
             memmove(quat + nlocal, quat + ghost_min,
                     (nall - nlocal) * sizeof(QUAT_T));
           }
         }
       }
       x[nall].x = (flt_t)INTEL_BIGP;
       x[nall].y = (flt_t)INTEL_BIGP;
       x[nall].z = (flt_t)INTEL_BIGP;
       x[nall].w = 1;
       quat[nall].w = (flt_t)1.0;
       quat[nall].i = (flt_t)0.0;
       quat[nall].j = (flt_t)0.0;
       quat[nall].k = (flt_t)0.0;
     }
     #endif
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = (acc_t)0.0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     if (NEWTON_PAIR == 0) f_start[1].w = 0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2;
       else foff = minlocal*-2;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
 
       flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
       flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
       flt_t * _noalias const dely_form = dely_formi + tid * max_nbors;
       flt_t * _noalias const delz_form = delz_formi + tid * max_nbors;
       int * _noalias const jtype_form = jtype_formi + tid * max_nbors;
       int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
 
       int ierror = 0;
       for (int i = iifrom; i < iito; i += iip) {
         // const int i = ilist[ii];
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
         const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
         const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
 
         flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
         flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
         flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;
 
         if (ijci[itype].form == ELLIPSE_ELLIPSE) {
           flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
           ME_quat_to_mat_trans(quat[i],a1);
           ME_diag_times3(ic[itype].well,a1,temp);
           ME_transpose_times3(a1,temp,b1);
           ME_diag_times3(ic[itype].shape2,a1,temp);
           ME_transpose_times3(a1,temp,g1);
         }
 
         acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
         fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
 
         if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
 
         bool multiple_forms = false;
         int packed_j = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           int jm = jlist[jj];
           int j = jm & NEIGHMASK;
           const int jtype = x[j].w;
 
           if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
             flt_t delx = x[j].x-xtmp;
             flt_t dely = x[j].y-ytmp;
             flt_t delz = x[j].z-ztmp;
             flt_t rsq = delx * delx + dely * dely + delz * delz;
 
             if (rsq < ijci[jtype].cutsq) {
               rsq_form[packed_j] = rsq;
               delx_form[packed_j] = delx;
               dely_form[packed_j] = dely;
               delz_form[packed_j] = delz;
               jtype_form[packed_j] = jtype;
               jlist_form[packed_j] = jm;
               packed_j++;
             }
           } else
             multiple_forms = true;
         }
         const int edge = packed_j & (pad_width - 1);
         if (edge) {
           const int packed_end = packed_j + (pad_width - edge);
           #if defined(LMP_SIMD_COMPILER)
           #pragma loop_count min=1, max=15, avg=8
           #endif
           for ( ; packed_j < packed_end; packed_j++)
             jlist_form[packed_j] = nall;
         }
 
         // -------------------------------------------------------------
 
         #ifdef INTEL_V512
         __assume(packed_j % INTEL_VECTOR_WIDTH == 0);
         __assume(packed_j % 8 == 0);
         __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
         #endif
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
                                  sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
         #endif
         for (int jj = 0; jj < packed_j; jj++) {
           flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
           flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
           flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
           flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
           flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
           flt_t rtor_0, rtor_1, rtor_2;
 
           const int sbindex = jlist_form[jj] >> SBBITS & 3;
           const int j = jlist_form[jj] & NEIGHMASK;
           flt_t factor_lj = special_lj[sbindex];
           const int jtype = jtype_form[jj];
           const flt_t sigma = ijci[jtype].sigma;
           const flt_t epsilon = ijci[jtype].epsilon;
           const flt_t shape2_0 = ic[jtype].shape2[0];
           const flt_t shape2_1 = ic[jtype].shape2[1];
           const flt_t shape2_2 = ic[jtype].shape2[2];
           flt_t one_eng, evdwl;
 
           ME_quat_to_mat_trans(quat[j], a2);
           ME_diag_times3(ic[jtype].well, a2, temp);
           ME_transpose_times3(a2, temp, b2);
           ME_diag_times3a(shape2, a2, temp);
           ME_transpose_times3(a2, temp, g2);
 
           flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
           flt_t temp1, temp2, temp3;
 
           flt_t r12hat_0, r12hat_1, r12hat_2;
           ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
           flt_t r = sqrt(rsq_form[jj]);
 
           // compute distance of closest approach
 
           flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
           ME_plus3(g1, g2, g12);
           flt_t kappa_0, kappa_1, kappa_2;
           ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
                        kappa, ierror);
 
           // tempv = G12^-1*r12hat
 
           flt_t inv_r = (flt_t)1.0 / r;
           tempv_0 = kappa_0 * inv_r;
           tempv_1 = kappa_1 * inv_r;
           tempv_2 = kappa_2 * inv_r;
           flt_t sigma12 = ME_dot3(r12hat, tempv);
           sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
           flt_t h12 = r - sigma12;
 
           // energy
           // compute u_r
 
           flt_t varrho = sigma / (h12 + gamma * sigma);
           flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
           flt_t varrho12 = varrho6 * varrho6;
           flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
 
           // compute eta_12
 
           flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
           flt_t det_g12 = ME_det3(g12);
           eta = std::pow(eta / det_g12, upsilon);
 
           // compute chi_12
 
           flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
           flt_t iota_0, iota_1, iota_2;
           ME_plus3(b1, b2, b12);
           ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
                        iota, ierror);
 
           // tempv = G12^-1*r12hat
 
           tempv_0 = iota_0 * inv_r;
           tempv_1 = iota_1 * inv_r;
           tempv_2 = iota_2 * inv_r;
           flt_t chi = ME_dot3(r12hat, tempv);
           chi = std::pow(chi * (flt_t)2.0, mu);
 
           // force
           // compute dUr/dr
 
           temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
             sigma;
           temp1 = temp1 * (flt_t)24.0 * epsilon;
           flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
           flt_t dUr_0, dUr_1, dUr_2;
           temp2 = ME_dot3(kappa, r12hat);
           flt_t uslj_rsq = u_slj / rsq_form[jj];
           dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
           dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
           dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);
 
           // compute dChi_12/dr
 
           flt_t dchi_0, dchi_1, dchi_2;
           temp1 = ME_dot3(iota, r12hat);
           temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
             std::pow(chi, (mu - (flt_t)1.0) / mu);
           dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
           dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
           dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
 
           temp1 = -eta * u_r;
           temp2 = eta * chi;
           fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
           fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
           fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;
 
           // torque for particle 1 and 2
           // compute dUr
 
           tempv_0 = -uslj_rsq * kappa_0;
           tempv_1 = -uslj_rsq * kappa_1;
           tempv_2 = -uslj_rsq * kappa_2;
           ME_vecmat(kappa, g1, tempv2);
           ME_cross3(tempv, tempv2, dUr);
           flt_t dUr2_0, dUr2_1, dUr2_2;
 
           if (NEWTON_PAIR) {
             ME_vecmat(kappa, g2, tempv2);
             ME_cross3(tempv, tempv2, dUr2);
           }
 
           // compute d_chi
 
           ME_vecmat(iota, b1, tempv);
           ME_cross3(tempv, iota, dchi);
           temp1 = (flt_t)-4.0 / rsq_form[jj];
           dchi_0 *= temp1;
           dchi_1 *= temp1;
           dchi_2 *= temp1;
           flt_t dchi2_0, dchi2_1, dchi2_2;
 
           if (NEWTON_PAIR) {
             ME_vecmat(iota, b2, tempv);
             ME_cross3(tempv, iota, dchi2);
             dchi2_0 *= temp1;
             dchi2_1 *= temp1;
             dchi2_2 *= temp1;
           }
 
           // compute d_eta
 
           flt_t deta_0, deta_1, deta_2;
           deta_0 = deta_1 = deta_2 = (flt_t)0.0;
           ME_compute_eta_torque(g12, a1, shape2, temp);
           temp1 = -eta * upsilon;
 
           tempv_0 = temp1 * temp_0;
           tempv_1 = temp1 * temp_1;
           tempv_2 = temp1 * temp_2;
           ME_mv0_cross3(a1, tempv, tempv2);
           deta_0 += tempv2_0;
           deta_1 += tempv2_1;
           deta_2 += tempv2_2;
 
           tempv_0 = temp1 * temp_3;
           tempv_1 = temp1 * temp_4;
           tempv_2 = temp1 * temp_5;
           ME_mv1_cross3(a1, tempv, tempv2);
           deta_0 += tempv2_0;
           deta_1 += tempv2_1;
           deta_2 += tempv2_2;
 
           tempv_0 = temp1 * temp_6;
           tempv_1 = temp1 * temp_7;
           tempv_2 = temp1 * temp_8;
           ME_mv2_cross3(a1, tempv, tempv2);
           deta_0 += tempv2_0;
           deta_1 += tempv2_1;
           deta_2 += tempv2_2;
 
           // compute d_eta for particle 2
 
           flt_t deta2_0, deta2_1, deta2_2;
           if (NEWTON_PAIR) {
             deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
             ME_compute_eta_torque(g12, a2, shape2, temp);
 
             tempv_0 = temp1 * temp_0;
             tempv_1 = temp1 * temp_1;
             tempv_2 = temp1 * temp_2;
             ME_mv0_cross3(a2, tempv, tempv2);
             deta2_0 += tempv2_0;
             deta2_1 += tempv2_1;
             deta2_2 += tempv2_2;
 
             tempv_0 = temp1 * temp_3;
             tempv_1 = temp1 * temp_4;
             tempv_2 = temp1 * temp_5;
             ME_mv1_cross3(a2, tempv, tempv2);
             deta2_0 += tempv2_0;
             deta2_1 += tempv2_1;
             deta2_2 += tempv2_2;
 
             tempv_0 = temp1 * temp_6;
             tempv_1 = temp1 * temp_7;
             tempv_2 = temp1 * temp_8;
             ME_mv2_cross3(a2, tempv, tempv2);
             deta2_0 += tempv2_0;
             deta2_1 += tempv2_1;
             deta2_2 += tempv2_2;
           }
 
           // torque
 
           temp1 = u_r * eta;
           temp2 = u_r * chi;
           temp3 = chi * eta;
 
           ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
             (flt_t)-1.0;
           ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
             (flt_t)-1.0;
           ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
             (flt_t)-1.0;
 
           if (NEWTON_PAIR) {
             rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
               (flt_t)-1.0;
             rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
               (flt_t)-1.0;
             rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
               (flt_t)-1.0;
           }
 
           one_eng = temp1 * chi;
           #ifndef INTEL_VMASK
           if (jlist_form[jj] == nall) {
             one_eng = (flt_t)0.0;
             fforce_0 = 0.0;
             fforce_1 = 0.0;
             fforce_2 = 0.0;
             ttor_0 = 0.0;
             ttor_1 = 0.0;
             ttor_2 = 0.0;
             rtor_0 = 0.0;
             rtor_1 = 0.0;
             rtor_2 = 0.0;
           }
           #endif
 
           fforce_0 *= factor_lj;
           fforce_1 *= factor_lj;
           fforce_2 *= factor_lj;
           ttor_0 *= factor_lj;
           ttor_1 *= factor_lj;
           ttor_2 *= factor_lj;
 
           #ifdef INTEL_VMASK
           if (jlist_form[jj] < nall) {
           #endif
             fxtmp += fforce_0;
             fytmp += fforce_1;
             fztmp += fforce_2;
             t1tmp += ttor_0;
             t2tmp += ttor_1;
             t3tmp += ttor_2;
 
             if (NEWTON_PAIR) {
               rtor_0 *= factor_lj;
               rtor_1 *= factor_lj;
               rtor_2 *= factor_lj;
               int jp = j * 2;
               f[jp].x -= fforce_0;
               f[jp].y -= fforce_1;
               f[jp].z -= fforce_2;
               jp++;
               f[jp].x += rtor_0;
               f[jp].y += rtor_1;
               f[jp].z += rtor_2;
             }
 
             if (EFLAG) {
               evdwl = factor_lj * one_eng;
               sevdwl += evdwl;
               if (eatom) {
                 fwtmp += (flt_t)0.5 * evdwl;
                 if (NEWTON_PAIR)
                   f[j*2].w += (flt_t)0.5 * evdwl;
               }
             }
 
             if (NEWTON_PAIR == 0) {
               if (vflag == 1) {
                 sv0 += delx_form[jj] * fforce_0;
                 sv1 += dely_form[jj] * fforce_1;
                 sv2 += delz_form[jj] * fforce_2;
                 sv3 += delx_form[jj] * fforce_1;
                 sv4 += delx_form[jj] * fforce_2;
                 sv5 += dely_form[jj] * fforce_2;
               }
             } // EVFLAG
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
 
         // -------------------------------------------------------------
 
         if (multiple_forms)
           ierror = 2;
 
         int ip = i * 2;
         if (NEWTON_PAIR) {
           f[ip].x += fxtmp;
           f[ip].y += fytmp;
           f[ip].z += fztmp;
           ip++;
           f[ip].x += t1tmp;
           f[ip].y += t2tmp;
           f[ip].z += t3tmp;
         } else {
           f[ip].x = fxtmp;
           f[ip].y = fytmp;
           f[ip].z = fztmp;
           ip++;
           f[ip].x = t1tmp;
           f[ip].y = t2tmp;
           f[ip].z = t3tmp;
         }
 
         if (EFLAG) {
           oevdwl += sevdwl;
           if (eatom) f[i * 2].w += fwtmp;
         }
         if (NEWTON_PAIR == 0) {
           if (vflag == 1) {
             ov0 += sv0;
             ov1 += sv1;
             ov2 += sv2;
             ov3 += sv3;
             ov4 += sv4;
             ov5 += sv5;
           }
         }
       } // for i
       int o_range;
       if (NEWTON_PAIR) {
         o_range = nall;
         if (offload == 0) o_range -= minlocal;
         IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
                                sizeof(FORCE_T));
         const int sto = iito * 8;
         const int fst4 = f_stride * 4;
         #if defined(_OPENMP)
         #pragma omp barrier
         #endif
         acc_t *f_scalar = &f_start[0].x;
         acc_t *f_scalar2 = f_scalar + fst4;
         for (int t = 1; t < nthreads; t++) {
           #if defined(LMP_SIMD_COMPILER)
           #pragma vector aligned
           #pragma simd
           #endif
           for (int n = iifrom * 8; n < sto; n++)
             f_scalar[n] += f_scalar2[n];
           f_scalar2 += fst4;
         }
 
         if (vflag==2) {
           const ATOM_T * _noalias const xo = x + minlocal;
           #if defined(LMP_SIMD_COMPILER)
           #pragma novector
           #endif
           for (int n = iifrom; n < iito; n++) {
             const int nt2 = n * 2;
             ov0 += f_start[nt2].x * xo[n].x;
             ov1 += f_start[nt2].y * xo[n].y;
             ov2 += f_start[nt2].z * xo[n].z;
             ov3 += f_start[nt2].y * xo[n].x;
             ov4 += f_start[nt2].z * xo[n].x;
             ov5 += f_start[nt2].z * xo[n].y;
           }
         }
       }
 
       if (ierror)
         f_start[1].w = ierror;
     } // omp
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)-0.5;
         ov1 *= (acc_t)-0.5;
         ov2 *= (acc_t)-0.5;
         ov3 *= (acc_t)-0.5;
         ov4 *= (acc_t)-0.5;
         ov5 *= (acc_t)-0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
 
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // offload
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
   else
     fix->add_result_array(f_start, 0, offload, 0, 0, 2);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Style initialisation for the /intel variant: runs the base-class setup,
 // adjusts the most recent neighbor request, looks up the "package_intel"
 // fix and packs the force constants for the active precision mode.
 void PairGayBerneIntel::init_style()
 {
   PairGayBerne::init_style();

   // The last neighbor request is the one modified here (presumably the
   // request added by the base-class call above - confirm).
   const int last_request = neighbor->nrequest - 1;
   if (!force->newton_pair) {
     neighbor->requests[last_request]->half = 0;
     neighbor->requests[last_request]->full = 1;
   }
   neighbor->requests[last_request]->intel = 1;
 
   // The intel fix must exist; error out otherwise.
   const int ifix = modify->find_fix("package_intel");
   if (ifix < 0) {
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   }
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   if (force->newton_pair) fix->set_offload_noghost(1);
   _cop = fix->coprocessor_number();
   #endif
 
   // Select the constant table / buffer pair matching the precision mode;
   // anything other than mixed or double uses the single-precision buffers.
   const int prec_mode = fix->precision();
   if (prec_mode == FixIntel::PREC_MODE_MIXED) {
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   } else if (prec_mode == FixIntel::PREC_MODE_DOUBLE) {
     pack_force_const(force_const_double, fix->get_double_buffers());
   } else {
     pack_force_const(force_const_single, fix->get_single_buffers());
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy the pair-style coefficients into the packed constant tables of `fc`
 // and, when offload is compiled in and a coprocessor is selected, transfer
 // them to the coprocessor.
 //
 //   fc      - packed force-constant storage to fill (resized via set_ntypes)
 //   buffers - intel buffers; supplies the neighbor limit, offload thread
 //             count and the cutneighsq table that is refreshed here
 template <class flt_t, class acc_t>
 void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
                                          IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Tables are dimensioned ntypes + 1 in each direction.
   int tp1 = atom->ntypes + 1;
   _max_nbors = buffers->get_max_nbors();
   // Per-thread scratch is sized for the larger of the host thread count
   // and the offload thread count.
   int mthreads = comm->nthreads;
   if (mthreads < buffers->get_off_threads())
     mthreads = buffers->get_off_threads();
   fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       // Only pairs that were set explicitly, or whose two diagonal entries
       // are both set, are (re)initialised; the result is stored symmetrically.
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   // Copy the special-bond LJ factors. Entry 0 is forced to 1.0 regardless
   // of force->special_lj[0]; the assignment sits inside the loop, so it is
   // simply repeated on every iteration with the same end result.
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_lj[0] = 1.0;
   }
   fc.gamma = gamma;
   fc.upsilon = upsilon;
   fc.mu = mu;
 
   // Fill the per-type-pair tables (ijc, lj34) and the per-type table (ic).
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       fc.ijc[i][j].lj1 = lj1[i][j];
       fc.ijc[i][j].lj2 = lj2[i][j];
       fc.ijc[i][j].cutsq = cutsq[i][j];
       fc.ijc[i][j].offset = offset[i][j];
       fc.ijc[i][j].sigma = sigma[i][j];
       fc.ijc[i][j].epsilon = epsilon[i][j];
       fc.ijc[i][j].form = form[i][j];
       fc.ijc[i][j].lshape = lshape[i] * lshape[j];
       fc.lj34[i][j].lj3 = lj3[i][j];
       fc.lj34[i][j].lj4 = lj4[i][j];
     }
     // NOTE(review): this reads shape2[i][0..3] and well[i][0..3]; confirm
     // that the base-class arrays really have at least 4 columns per type.
     for (int j = 0; j < 4; j++) {
       fc.ic[i].shape2[j] = shape2[i][j];
       fc.ic[i].well[j] = well[i][j];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   // Nothing to transfer when no coprocessor is selected.
   if (_cop < 0) return;
   // Plain local pointers to the first element of each table, as named in
   // the offload_transfer clauses below.
   flt_t * special_lj = fc.special_lj;
   FC_PACKED1_T *oijc = fc.ijc[0];
   FC_PACKED2_T *olj34 = fc.lj34[0];
   FC_PACKED3_T *oic = fc.ic;
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   if (oijc != NULL && oic != NULL) {
     // Send the freshly packed constants to the coprocessor; the constant
     // tables carry alloc_if(0) free_if(0), i.e. no allocation or release
     // is requested for them here.
     #pragma offload_transfer target(mic:_cop) \
       in(special_lj: length(4) alloc_if(0) free_if(0)) \
       in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
       in(oic: length(tp1) alloc_if(0) free_if(0)) \
       in(ocutneighsq: length(tp1sq))
   }
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)size the packed constant tables and the per-thread neighbor scratch
 // arrays. Work is only done when `ntypes` differs from the stored _ntypes:
 // existing storage is released first (if _ntypes > 0), then new storage is
 // created (if ntypes > 0). With offload compiled in, matching coprocessor
 // allocations are released / created as well.
 //
 //   ntypes     - dimension of the type tables (ijc/lj34 are ntypes x ntypes)
 //   one_length - length of each per-thread scratch row
 //   nthreads   - number of per-thread scratch rows
 //   memory     - allocator used for the new arrays; remembered in _memory
 //   cop        - coprocessor number; stored in _cop when arrays are created
 template <class flt_t>
 void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                       const int one_length,
                                                       const int nthreads,
                                                       Memory *memory,
                                                       const int cop) {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       // Release the previous allocation. Uses the previously stored
       // _memory and _cop, not the arguments of this call.
       fc_packed3 *oic = ic;
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       fc_packed1 *oijc = ijc[0];
       fc_packed2 *olj34 = lj34[0];
       flt_t * orsq_form = rsq_form[0];
       flt_t * odelx_form = delx_form[0];
       flt_t * odely_form = dely_form[0];
       flt_t * odelz_form = delz_form[0];
       int * ojtype_form = jtype_form[0];
       int * ojlist_form = jlist_form[0];
 
       // NOTE(review): oic is named in the nocopy clause below but is not
       // part of this NULL check (the allocation branch does check it).
       if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
           orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
           odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
           _cop >= 0) {
         // free_if(1) with nocopy: release the coprocessor-side buffers
         // without transferring any data.
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
           nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
           nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
       }
       #endif
 
       // NOTE(review): the local copy `oic` is passed here rather than the
       // member `ic`; if destroy() resets its argument, `ic` itself is left
       // unchanged until it is re-created below - confirm this is intended.
       _memory->destroy(oic);
       _memory->destroy(ijc);
       _memory->destroy(lj34);
       _memory->destroy(rsq_form);
       _memory->destroy(delx_form);
       _memory->destroy(dely_form);
       _memory->destroy(delz_form);
       _memory->destroy(jtype_form);
       _memory->destroy(jlist_form);
     }
 
     if (ntypes > 0) {
       _cop = cop;
       // Type tables.
       memory->create(ijc, ntypes, ntypes, "fc.ijc");
       memory->create(lj34, ntypes, ntypes, "fc.lj34");
       memory->create(ic, ntypes, "fc.ic");
       // Per-thread scratch rows: nthreads x one_length each.
       memory->create(rsq_form, nthreads, one_length, "rsq_form");
       memory->create(delx_form, nthreads, one_length, "delx_form");
       memory->create(dely_form, nthreads, one_length, "dely_form");
       memory->create(delz_form, nthreads, one_length, "delz_form");
       memory->create(jtype_form, nthreads, one_length, "jtype_form");
       memory->create(jlist_form, nthreads, one_length, "jlist_form");
 
       // Give every scratch entry a defined starting value (10.0 for the
       // distance data, type 1, neighbor index 0).
       for (int zn = 0; zn < nthreads; zn++)
         for (int zo = 0; zo < one_length; zo++) {
           rsq_form[zn][zo] = 10.0;
           delx_form[zn][zo] = 10.0;
           dely_form[zn][zo] = 10.0;
           delz_form[zn][zo] = 10.0;
           jtype_form[zn][zo] = 1;
           jlist_form[zn][zo] = 0;
         }
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       fc_packed1 *oijc = ijc[0];
       fc_packed2 *olj34 = lj34[0];
       fc_packed3 *oic = ic;
       flt_t * orsq_form = rsq_form[0];
       flt_t * odelx_form = delx_form[0];
       flt_t * odely_form = dely_form[0];
       flt_t * odelz_form = delz_form[0];
       int * ojtype_form = jtype_form[0];
       int * ojlist_form = jlist_form[0];
       // Total element count of one scratch array (all thread rows).
       int off_onel = one_length * nthreads;
 
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
           oic != NULL && orsq_form != NULL && odelx_form != NULL &&
           odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
           ojlist_form !=NULL && cop >= 0) {
         // Allocate coprocessor-side storage (alloc_if(1), kept with
         // free_if(0)). The constant tables are allocated without a copy
         // (nocopy); the scratch arrays are sent with their initial values.
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
           in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
           in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   // Recorded on every call, including when the size did not change.
   _ntypes = ntypes;
   _memory = memory;
 }
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
index 0dc2c275e..e3afcd64a 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@@ -1,595 +1,595 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include "pair_lj_charmm_coul_charmm_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "memory.h"
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 #define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
 
 /* ---------------------------------------------------------------------- */
 
 // Construct through the base pair style and OR Suffix::INTEL into the
 // style's suffix flag.
 PairLJCharmmCoulCharmmIntel::PairLJCharmmCoulCharmmIntel(LAMMPS *lmp)
   : PairLJCharmmCoulCharmm(lmp)
 {
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Nothing to release here; the destructor body is intentionally empty.
 PairLJCharmmCoulCharmmIntel::~PairLJCharmmCoulCharmmIntel() {}
 
 /* ---------------------------------------------------------------------- */
 
 // Entry point: forwards to the templated compute() instantiated for the
 // precision mode reported by the intel fix, then stamps the balance timer
 // and clears vflag_fdotr.
 void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag)
 {
   const int prec_mode = fix->precision();

   if (prec_mode == FixIntel::PREC_MODE_MIXED) {
     // float arithmetic, double accumulation
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   } else if (prec_mode == FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   } else {
     // any other mode: single precision throughout
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
   }
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-templated driver: sets up energy/virial flags, packs atom data
 // into the intel buffers when required, and calls eval() twice - once with
 // offload = 1 for [0, offload_end) and once with offload = 0 for
 // [host_start, inum) - using the EFLAG / NEWTON_PAIR instantiation that
 // matches the run-time settings.
 template <class flt_t, class acc_t>
 void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
                                         IntelBuffers<flt_t,acc_t> *buffers,
                                         const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   // Packing happens only when ago is non-zero and the fix does not use
   // separate buffers.
   const bool need_pack = (ago != 0) && (fix->separate_buffers() == 0);
   if (need_pack) {
     fix->start_watch(TIME_PACK);
 
     // Thread the pack only when more than INTEL_HTHREADS threads exist.
     int packthreads = (nthreads > INTEL_HTHREADS) ? nthreads : 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // -------------------- Regular version
   // Virial mode handed to eval(): 2 if vflag_fdotr is set, else 1 if vflag
   // is set, else 0.
   const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);

   if (force->newton_pair) {
     if (eflag) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (eflag) {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Force kernel for atoms [astart, aend).
 //
 //   EFLAG       - compile-time switch: accumulate energies
 //   NEWTON_PAIR - compile-time switch: also apply the reaction force to j
 //   offload     - non-zero when this call is the offload pass
 //   vflag       - virial mode (0 none, 1 per-pair tally, 2 handled by the
 //                 fdotr reduction macros)
 //   buffers     - intel buffers supplying positions, charges, neighbor
 //                 data and the per-thread neighbor cache
 //   fc          - packed force constants
 //
 // For each atom i the neighbor list is first filtered by the Coulomb
 // cutoff into a per-thread cache, then a second loop over the cached
 // entries computes the switched Coulomb and LJ terms.
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
                                      IntelBuffers<flt_t,acc_t> *buffers,
                                      const ForceConst<flt_t> &fc,
                                      const int astart, const int aend)
 {
   // Nothing to do for an empty atom range.
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
   // Reciprocals of the switching denominators, so the inner loop
   // multiplies instead of divides.
   const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
   const flt_t inv_denom_coul = (flt_t)1.0/denom_coul;
 
   const flt_t * _noalias const cutsq = fc.cutsq[0];
   const LJ_T * _noalias const lj = fc.lj[0];
   const flt_t cut_ljsq = fc.cut_ljsq;
   const flt_t cut_lj_innersq = fc.cut_lj_innersq;
   const flt_t cut_coul_innersq = fc.cut_coul_innersq;
   const flt_t cut_coulsq = fc.cut_coulsq;
 
   // Row stride of the flattened per-type tables (ntypes + 1).
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Neighbor cache arrays; each thread uses its own slice of length
   // ccache_stride (see toffs below).
   flt_t * _noalias const ccachex = buffers->get_ccachex();
   flt_t * _noalias const ccachey = buffers->get_ccachey();
   flt_t * _noalias const ccachez = buffers->get_ccachez();
   flt_t * _noalias const ccachew = buffers->get_ccachew();
   int * _noalias const ccachei = buffers->get_ccachei();
   int * _noalias const ccachej = buffers->get_ccachej();
   const int ccache_stride = _ccache_stride;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(ccache_stride,nthreads,qqrd2e,inum,nall,ntypes,cut_coulsq) \
     in(vflag,eatom,f_stride,separate_flag,offload) \
     in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
     in(inv_denom_coul,cut_coul_innersq) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, q);
 
     // Global energy / virial accumulators, combined across threads by the
     // OpenMP reduction below.
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       // With NEWTON_PAIR each thread writes into its own force slice
       // (offset tid * f_stride), which is zeroed here; otherwise all
       // threads share the single force array.
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
       // Note: cutboth is not referenced again in this function.
       flt_t cutboth = cut_coulsq;
 
       // This thread's slice of the neighbor cache.
       const int toffs = tid * ccache_stride;
       flt_t * _noalias const tdelx = ccachex + toffs;
       flt_t * _noalias const tdely = ccachey + toffs;
       flt_t * _noalias const tdelz = ccachez + toffs;
       flt_t * _noalias const trsq = ccachew + toffs;
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
       for (int i = iifrom; i < iito; i += iip) {
         //        const int i = ilist[ii];
         // The atom type is stored in the w component of the position.
         const int itype = x[i].w;
 
         // Row of the flattened type tables belonging to itype.
         const int ptr_off = itype * ntypes;
         const flt_t * _noalias const cutsqi = cutsq + ptr_off;
         const LJ_T * _noalias const lji = lj + ptr_off;
 
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         // Pass 1: keep only neighbors inside the Coulomb cutoff and store
         // their separation, type and raw neighbor entry in the cache.
         int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
           if (rsq < cut_coulsq) {
             trsq[ej]=rsq;
             tdelx[ej]=delx;
             tdely[ej]=dely;
             tdelz[ej]=delz;
             tjtype[ej]=x[j].w;
             tj[ej]=jlist[jj];
             ej++;
           }
         }
 
         // Pass 2: force / energy evaluation over the ej cached neighbors.
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                                sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl;
           forcecoul = forcelj = evdwl = (flt_t)0.0;
 
           // The cached entry still carries the special-bond bits above the
           // atom index; sbindex extracts them (0 means no scaling below).
           const int j = tj[jj] & NEIGHMASK;
           const int sbindex = tj[jj] >> SBBITS & 3;
           const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
 	  const flt_t r_inv = (flt_t)1.0 / sqrt(rsq);
 	  forcecoul = qqrd2e * qtmp * q[j] * r_inv;
 	  // Apply the Coulomb switching factor beyond the inner cutoff.
 	  if (rsq > cut_coul_innersq) {
 	    const flt_t ccr = cut_coulsq - rsq;
 	    const flt_t switch1 = ccr * ccr * inv_denom_coul *
               (cut_coulsq + (flt_t)2.0 * rsq - (flt_t)3.0 * cut_coul_innersq);
             forcecoul *= switch1; 
           }
 
           // With INTEL_VMASK the LJ term is computed under real branches;
           // without it the term is always computed and zeroed afterwards
           // when rsq exceeds the LJ cutoff.
           #ifdef INTEL_VMASK
           if (rsq < cut_ljsq) {
           #endif
 	    const int jtype = tjtype[jj];
             flt_t r6inv = r2inv * r2inv * r2inv;
             forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
             if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
 
             #ifdef INTEL_VMASK
             if (rsq > cut_lj_innersq) {
             #endif
               // LJ switching between the inner and outer LJ cutoffs.
               const flt_t drsq = cut_ljsq - rsq;
               const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
               const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
                   inv_denom_lj;
               const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
               if (EFLAG) {
                 #ifndef INTEL_VMASK
                 if (rsq > cut_lj_innersq) {
                 #endif
                   forcelj = forcelj * switch1 + evdwl * switch2;
                   evdwl *= switch1;
                 #ifndef INTEL_VMASK
                 }
                 #endif
               } else {
                 // Energy is not tracked here, so the unswitched LJ energy
                 // needed by the force switch is recomputed locally.
                 const flt_t philj = r6inv * (lji[jtype].z*r6inv -
                     lji[jtype].w);
                 #ifndef INTEL_VMASK
                 if (rsq > cut_lj_innersq)
                 #endif
                   forcelj =  forcelj * switch1 + philj * switch2;
               }
             #ifdef INTEL_VMASK
             }
             #endif
 
           #ifdef INTEL_VMASK
           }
           #else
           if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 	  // Scale by the special-bond factors for flagged neighbors.
 	  if (sbindex) {
   	    const flt_t factor_coul = special_coul[sbindex];
 	    forcecoul *= factor_coul;
 	    const flt_t factor_lj = special_lj[sbindex];
 	    forcelj *= factor_lj;
 	    if (EFLAG) evdwl *= factor_lj;
           }
 
           // Accumulate the pair force on i and, with NEWTON_PAIR, subtract
           // it from j.
           const flt_t fpair = (forcecoul + forcelj) * r2inv;
           const flt_t fpx = fpair * tdelx[jj];
           fxtmp += fpx;
           if (NEWTON_PAIR) f[j].x -= fpx;
           const flt_t fpy = fpair * tdely[jj];
           fytmp += fpy;
           if (NEWTON_PAIR) f[j].y -= fpy;
           const flt_t fpz = fpair * tdelz[jj];
           fztmp += fpz;
           if (NEWTON_PAIR) f[j].z -= fpz;
 
           if (EFLAG) {
             // forcecoul (before the r2inv factor) is also what is summed
             // as the Coulomb energy.
             sevdwl += evdwl;
             secoul += forcecoul;
             if (eatom) {
               // Per-atom energy: half of the pair energy to each atom.
               fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
               if (NEWTON_PAIR)
                 f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
             }
           }
           if (NEWTON_PAIR == 0)
             IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
                                   fpx, fpy, fpz);
         } // for jj
         // With NEWTON_PAIR other atoms' iterations may already have
         // contributed to f[i], so add; otherwise f[i] is written once.
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
         IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     // Publish the global tallies in ev_global: [0] vdW energy, [1] Coulomb
     // energy, [2..7] virial components. Without NEWTON_PAIR the sums are
     // halved before being stored.
     if (EFLAG) {
       if (NEWTON_PAIR == 0) {
         oevdwl *= (acc_t)0.5;
         oecoul *= (acc_t)0.5;
       }
       ev_global[0] = oevdwl;
       ev_global[1] = oecoul;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   // Hand the force buffer (and the tallies, if any were requested) back to
   // the fix.
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairLJCharmmCoulCharmmIntel::init_style()
 {
   PairLJCharmmCoulCharmm::init_style();
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ----------------------------------------------------------------------
    Fill the ForceConst table fc with the data the force kernel reads:
    squared cutoffs, special-bond scaling factors and the per-type-pair
    LJ coefficients (lj1..lj4) and cutsq. Also sizes the coordinate cache
    and the per-type tables in buffers and recomputes cutsq/cutneighsq.
    When built with _LMP_INTEL_OFFLOAD and a coprocessor is in use
    (_cop >= 0), the tables are then transferred to the coprocessor.
    Raises an error if cut_lj > cut_coul.
 ------------------------------------------------------------------------- */
 template <class flt_t, class acc_t>
 void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Request the coordinate cache on the coprocessor only when one is used.
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop >= 0) off_ccache = 1;
   #endif
   buffers->grow_ccache(off_ccache, comm->nthreads, 1);
   _ccache_stride = buffers->ccache_stride();
 
   // Tables are indexed by atom type, which runs from 1 to ntypes, so they
   // are allocated with ntypes+1 entries per dimension.
   int tp1 = atom->ntypes + 1;
 
   fc.set_ntypes(tp1, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   if (cut_lj > cut_coul)
     error->all(FLERR,
          "Intel variant of lj/charmm/coul/charmm expects lj cutoff<=coulombic");
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   cut_coul_innersq = cut_coul_inner * cut_coul_inner;
   cut_lj_innersq = cut_lj_inner * cut_lj_inner;
   cut_ljsq = cut_lj * cut_lj;
   cut_coulsq = cut_coul * cut_coul;
   cut_bothsq = MAX(cut_ljsq, cut_coulsq);
 
   fc.cut_coulsq = cut_coulsq;
   fc.cut_ljsq = cut_ljsq;
   fc.cut_coul_innersq = cut_coul_innersq;
   fc.cut_lj_innersq = cut_lj_innersq;
 
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_coul[i] = force->special_coul[i];
   }
   // Entry 0 is always stored as 1.0, whatever force->special_*[0] holds.
   fc.special_coul[0] = 1.0;
   fc.special_lj[0] = 1.0;
 
   // Copy the LJ coefficients and squared cutoffs for each pair of types.
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       fc.lj[i][j].x = lj1[i][j];
       fc.lj[i][j].y = lj2[i][j];
       fc.lj[i][j].z = lj3[i][j];
       fc.lj[i][j].w = lj4[i][j];
       fc.cutsq[i][j] = cutsq[i][j];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   // Local aliases are needed because the offload pragma takes plain
   // pointer names; storage was allocated earlier (alloc_if(0)).
   flt_t * special_lj = fc.special_lj;
   flt_t * special_coul = fc.special_coul;
   flt_t * cutsq = fc.cutsq[0];
   LJ_T * lj = fc.lj[0];
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    (Re)allocate the per-type tables cutsq and lj as ntypes x ntypes arrays.
    Nothing is reallocated when ntypes equals the currently stored _ntypes.
    Existing tables (when _ntypes > 0) are destroyed through the previously
    stored _memory; passing ntypes <= 0 therefore only frees.
    With _LMP_INTEL_OFFLOAD and cop >= 0, the matching coprocessor copies
    are freed / allocated alongside the host arrays.
    The new ntypes and memory are recorded on every call.
 ------------------------------------------------------------------------- */
 template <class flt_t>
 void PairLJCharmmCoulCharmmIntel::ForceConst<flt_t>::set_ntypes(
   const int ntypes, Memory *memory, const int cop) {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       // Release the coprocessor copies before the host arrays go away.
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       flt_t * ocutsq = cutsq[0];
       typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
       if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
           ospecial_coul != NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(ocutsq, olj: alloc_if(0) free_if(1))
       }
       #endif
 
       // Freed with the Memory object saved by the previous call.
       _memory->destroy(cutsq);
       _memory->destroy(lj);
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
       memory->create(lj,ntypes,ntypes,"fc.lj");
 
       #ifdef _LMP_INTEL_OFFLOAD
       // Allocate (without copying data) the coprocessor-side storage.
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       flt_t * ocutsq = cutsq[0];
       typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
           ospecial_coul != NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
           nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _memory=memory;
 }
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
index fe9952512..a2680cdff 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -1,712 +1,712 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include "pair_lj_charmm_coul_long_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "kspace.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "memory.h"
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 #define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
 #define TABLE_T typename ForceConst<flt_t>::table_t
 
 /* ---------------------------------------------------------------------- */
 
 PairLJCharmmCoulLongIntel::PairLJCharmmCoulLongIntel(LAMMPS *lmp) :
   PairLJCharmmCoulLong(lmp)
 {
   // Clear the rRESPA-related members inherited from the base style
   // (presumably disabling rRESPA for this variant -- confirm).
   cut_respa = NULL;
   respa_enable = 0;
   // Tag this style with the INTEL suffix bit.
   suffix_flag = suffix_flag | Suffix::INTEL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Empty destructor body: no cleanup is performed here.
 PairLJCharmmCoulLongIntel::~PairLJCharmmCoulLongIntel()
 {
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag)
 {
   // Dispatch to the templated implementation for the active precision
   // mode: <compute type, accumulator type>. Mixed and single precision
   // share the single-precision constant table.
   switch (fix->precision()) {
   case FixIntel::PREC_MODE_MIXED:
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
     break;
   case FixIntel::PREC_MODE_DOUBLE:
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
     break;
   default:
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
     break;
   }
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 /* ----------------------------------------------------------------------
    Precision-specific driver: sets up the energy/virial flags, packs atom
    data into the buffers when required, then runs eval() twice -- once
    for the offload range [0, offload_end) and once for the host range
    [host_start, inum) -- with EFLAG/NEWTON_PAIR fixed at compile time.
 ------------------------------------------------------------------------- */
 template <class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
                                         IntelBuffers<flt_t,acc_t> *buffers,
                                         const ForceConst<flt_t> &fc)
 {
   if (!eflag && !vflag) {
     evflag = vflag_fdotr = 0;
   } else {
     ev_setup(eflag,vflag);
   }
 
   const int list_inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_begin = fix->host_start_pair();
   const int offload_stop = fix->offload_end_pair();
   const int neigh_ago = neighbor->ago;
 
   // Pack atom data here only when _lrt is off, ago is non-zero and the
   // fix does not use separate buffers.
   if (_lrt == 0 && neigh_ago != 0) {
     if (fix->separate_buffers() == 0) {
       fix->start_watch(TIME_PACK);
 
       // Use all threads for packing only above INTEL_HTHREADS threads.
       int packthreads = (nthreads > INTEL_HTHREADS) ? nthreads : 1;
       #if defined(_OPENMP)
       #pragma omp parallel if(packthreads > 1)
       #endif
       {
         int ifrom, ito, tid;
         IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
                                   packthreads, sizeof(ATOM_T));
         buffers->thr_pack(ifrom,ito,neigh_ago);
       }
       fix->stop_watch(TIME_PACK);
     }
   }
 
   // -------------------- Regular version
   // Virial mode handed to eval: 2 when vflag_fdotr is set, else 1 when
   // vflag is set, else 0.
   const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
 
   if (force->newton_pair) {
     if (eflag) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_stop);
       eval<1,1>(0, ovflag, buffers, fc, host_begin, list_inum);
     } else {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_stop);
       eval<0,1>(0, ovflag, buffers, fc, host_begin, list_inum);
     }
   } else {
     if (eflag) {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_stop);
       eval<1,0>(0, ovflag, buffers, fc, host_begin, list_inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_stop);
       eval<0,0>(0, ovflag, buffers, fc, host_begin, list_inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Force kernel for atoms astart..aend-1 of the neighbor list.
    Template flags: EFLAG enables energy accumulation, NEWTON_PAIR enables
    applying the reaction force to the neighbor atom j.
    offload selects the offload or host buffers; vflag is the virial mode
    passed on to the IP_PRE_* tally/reduction macros.
    Returns immediately when the range is empty. Results are handed to
    fix->add_result_array() at the end.
 ------------------------------------------------------------------------- */
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
                                      IntelBuffers<flt_t,acc_t> *buffers,
                                      const ForceConst<flt_t> &fc,
                                      const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
   const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
 
   // Flat views of the per-type tables and the Coulomb lookup tables.
   const flt_t * _noalias const cutsq = fc.cutsq[0];
   const LJ_T * _noalias const lj = fc.lj[0];
   const TABLE_T * _noalias const table = fc.table;
   const flt_t * _noalias const etable = fc.etable;
   const flt_t * _noalias const detable = fc.detable;
   const flt_t * _noalias const ctable = fc.ctable;
   const flt_t * _noalias const dctable = fc.dctable;
   const flt_t cut_ljsq = fc.cut_ljsq;
   const flt_t cut_lj_innersq = fc.cut_lj_innersq;
   const flt_t cut_coulsq = fc.cut_coulsq;
   const flt_t g_ewald = fc.g_ewald;
   const flt_t tabinnersq = fc.tabinnersq;
 
   // Row length of the flat per-type tables (atom->ntypes + 1).
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Per-thread scratch arrays; each thread uses a ccache_stride-sized slice.
   flt_t * _noalias const ccachex = buffers->get_ccachex();
   flt_t * _noalias const ccachey = buffers->get_ccachey();
   flt_t * _noalias const ccachez = buffers->get_ccachez();
   flt_t * _noalias const ccachew = buffers->get_ccachew();
   int * _noalias const ccachei = buffers->get_ccachei();
   int * _noalias const ccachej = buffers->get_ccachej();
   const int ccache_stride = _ccache_stride;
 
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
   const int ncoultablebits = this->ncoultablebits;
   const int ncoulmask = this->ncoulmask;
   const int ncoulshiftbits = this->ncoulshiftbits;
   #ifdef INTEL_ALLOW_TABLE
   #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
                     in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
                     in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
   #else
   #define ITABLE_IN
   #endif
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(ccache_stride,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,cut_coulsq) \
     in(vflag,eatom,f_stride,separate_flag,offload) \
     in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     ITABLE_IN signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, q);
 
     // Global energy (oevdwl, oecoul) and virial (ov0..ov5) accumulators,
     // summed across threads by the OpenMP reduction below.
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       // With NEWTON_PAIR each thread writes to its own f_stride-sized force
       // slice (zeroed here); otherwise all threads share one array.
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
       flt_t cutboth = cut_coulsq;
 
       const int toffs = tid * ccache_stride;
       flt_t * _noalias const tdelx = ccachex + toffs;
       flt_t * _noalias const tdely = ccachey + toffs;
       flt_t * _noalias const tdelz = ccachez + toffs;
       flt_t * _noalias const trsq = ccachew + toffs;
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
       for (int i = iifrom; i < iito; i += iip) {
         //        const int i = ilist[ii];
         const int itype = x[i].w;
 
         // Rows of the per-type tables for atom i's type.
         const int ptr_off = itype * ntypes;
         const flt_t * _noalias const cutsqi = cutsq + ptr_off;
         const LJ_T * _noalias const lji = lj + ptr_off;
 
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         // Pass 1: keep only neighbors inside the Coulomb cutoff and store
         // their separation, squared distance, type and raw list entry in
         // the thread's scratch arrays; ej counts the kept neighbors.
         int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
           if (rsq < cut_coulsq) {
             trsq[ej]=rsq;
             tdelx[ej]=delx;
             tdely[ej]=dely;
             tdelz[ej]=delz;
             tjtype[ej]=x[j].w;
             tj[ej]=jlist[jj];
             ej++;
           }
         }
 
         // Pass 2: evaluate Coulomb and LJ terms for the kept neighbors.
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                                sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
           forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
 
           // The raw list entry carries the atom index (low bits) and a
           // 2-bit index into the special_* factor arrays (above SBBITS).
           const int j = tj[jj] & NEIGHMASK;
           const int sbindex = tj[jj] >> SBBITS & 3;
           const int jtype = tjtype[jj];
           const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           // Coulomb term: evaluated analytically with a polynomial erfc
           // approximation, or (when tables are allowed and apply) by
           // linear interpolation in the lookup tables.
           #ifdef INTEL_ALLOW_TABLE
           if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
             const flt_t A1 =  0.254829592;
             const flt_t A2 = -0.284496736;
             const flt_t A3 =  1.421413741;
             const flt_t A4 = -1.453152027;
             const flt_t A5 =  1.061405429;
             const flt_t EWALD_F = 1.12837917;
             const flt_t INV_EWALD_P = 1.0 / 0.3275911;
 
             const flt_t r = (flt_t)1.0 / sqrt(r2inv);
             const flt_t grij = g_ewald * r;
             const flt_t expm2 = exp(-grij * grij);
             const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
             const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
             const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
             forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
             if (EFLAG) ecoul = prefactor * erfc;
 
             // Special-bond correction; zero when special_coul[sbindex] is 1.
             const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
               prefactor;
             forcecoul -= adjust;
             if (EFLAG) ecoul -= adjust;
 
           #ifdef INTEL_ALLOW_TABLE
           } else {
             // The table index is taken from the bit pattern of rsq as float.
             float rsq_lookup = rsq;
             const int itable = (__intel_castf32_u32(rsq_lookup) &
                                 ncoulmask) >> ncoulshiftbits;
             const flt_t fraction = (rsq_lookup - table[itable].r) *
               table[itable].dr;
 
             const flt_t tablet = table[itable].f +
               fraction * table[itable].df;
             forcecoul = qtmp * q[j] * tablet;
             if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
                                fraction * detable[itable]);
             if (sbindex) {
               const flt_t table2 = ctable[itable] +
                 fraction * dctable[itable];
               const flt_t prefactor = qtmp * q[j] * table2;
               const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
                 prefactor;
               forcecoul -= adjust;
               if (EFLAG) ecoul -= adjust;
             }
           }
           #endif
 
           // LJ term. With INTEL_VMASK the cutoff tests are real branches;
           // without it the term is always computed and zeroed afterwards
           // when rsq > cut_ljsq.
           #ifdef INTEL_VMASK
           if (rsq < cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
             if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
 
             // Switching function, applied between the inner and outer
             // LJ cutoffs.
             #ifdef INTEL_VMASK
             if (rsq > cut_lj_innersq) {
             #endif
               const flt_t drsq = cut_ljsq - rsq;
               const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
               const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
                   inv_denom_lj;
               const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
               if (EFLAG) {
                 #ifndef INTEL_VMASK
                 if (rsq > cut_lj_innersq) {
                 #endif
                   forcelj = forcelj * switch1 + evdwl * switch2;
                   evdwl *= switch1;
                 #ifndef INTEL_VMASK
                 }
                 #endif
               } else {
                 // evdwl is not computed without EFLAG, so the unswitched
                 // LJ energy needed by the force is recomputed here.
                 const flt_t philj = r6inv * (lji[jtype].z*r6inv -
                     lji[jtype].w);
                 #ifndef INTEL_VMASK
                 if (rsq > cut_lj_innersq)
                 #endif
                   forcelj =  forcelj * switch1 + philj * switch2;
               }
             #ifdef INTEL_VMASK
             }
             #endif
 
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcelj *= factor_lj;
               if (EFLAG) evdwl *= factor_lj;
             }
           #ifdef INTEL_VMASK
           }
           #else
           if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
           // Accumulate the force on atom i; with NEWTON_PAIR the opposite
           // force is applied to atom j.
           const flt_t fpair = (forcecoul + forcelj) * r2inv;
           const flt_t fpx = fpair * tdelx[jj];
           fxtmp += fpx;
           if (NEWTON_PAIR) f[j].x -= fpx;
           const flt_t fpy = fpair * tdely[jj];
           fytmp += fpy;
           if (NEWTON_PAIR) f[j].y -= fpy;
           const flt_t fpz = fpair * tdelz[jj];
           fztmp += fpz;
           if (NEWTON_PAIR) f[j].z -= fpz;
 
           if (EFLAG) {
             sevdwl += evdwl;
             secoul += ecoul;
             if (eatom) {
               // Per-atom energy: half of the pair energy goes to each atom.
               fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
               if (NEWTON_PAIR)
                 f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
             }
           }
           if (NEWTON_PAIR == 0)
             IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
                                   fpx, fpy, fpz);
         } // for jj
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
         IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     // Without NEWTON_PAIR the totals are halved before being stored
     // (presumably because the full neighbor list visits each pair twice).
     if (EFLAG) {
       if (NEWTON_PAIR == 0) {
         oevdwl *= (acc_t)0.5;
         oecoul *= (acc_t)0.5;
       }
       ev_global[0] = oevdwl;
       ev_global[1] = oecoul;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   // Hand forces (and energy/virial totals when computed) back to the fix.
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairLJCharmmCoulLongIntel::init_style()
 {
   PairLJCharmmCoulLong::init_style();
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 
   _lrt = fix->lrt();
 }
 
 /* ----------------------------------------------------------------------
    Fill the ForceConst table fc with the data the force kernel reads:
    squared cutoffs, g_ewald, tabinnersq, special-bond scaling factors,
    per-type-pair LJ coefficients (lj1..lj4) and cutsq, and -- when
    ncoultablebits is non-zero -- the Coulomb lookup tables.
    Also sizes the coordinate cache and per-type tables in buffers and
    recomputes cutsq/cutneighsq.
    When built with _LMP_INTEL_OFFLOAD and a coprocessor is in use
    (_cop >= 0), the tables are then transferred to the coprocessor.
    Raises an error if cut_lj > cut_coul.
 ------------------------------------------------------------------------- */
 template <class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   // Request the coordinate cache on the coprocessor only when one is used.
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop >= 0) off_ccache = 1;
   #endif
   buffers->grow_ccache(off_ccache, comm->nthreads, 1);
   _ccache_stride = buffers->ccache_stride();
 
   // Per-type tables get ntypes+1 entries per dimension; the Coulomb
   // tables get 2^ncoultablebits entries (1 when tabulation is off).
   int tp1 = atom->ntypes + 1;
   int ntable = 1;
   if (ncoultablebits)
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   if (cut_lj > cut_coul)
     error->all(FLERR,
          "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   cut_lj_innersq = cut_lj_inner * cut_lj_inner;
   cut_ljsq = cut_lj * cut_lj;
   cut_coulsq = cut_coul * cut_coul;
   cut_bothsq = MAX(cut_ljsq, cut_coulsq);
 
   fc.g_ewald = force->kspace->g_ewald;
   fc.tabinnersq = tabinnersq;
   fc.cut_coulsq = cut_coulsq;
   fc.cut_ljsq = cut_ljsq;
   fc.cut_lj_innersq = cut_lj_innersq;
 
   // Copy the special-bond factors; entry 0 always ends up as 1.0 (the two
   // stores of 1.0 are repeated on every iteration, which is harmless).
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_coul[i] = force->special_coul[i];
     fc.special_coul[0] = 1.0;
     fc.special_lj[0] = 1.0;
   }
 
   // Copy the LJ coefficients and squared cutoffs for each pair of types.
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       fc.lj[i][j].x = lj1[i][j];
       fc.lj[i][j].y = lj2[i][j];
       fc.lj[i][j].z = lj3[i][j];
       fc.lj[i][j].w = lj4[i][j];
       fc.cutsq[i][j] = cutsq[i][j];
     }
   }
 
   // Copy the Coulomb lookup tables only when tabulation is enabled.
   if (ncoultablebits) {
     for (int i = 0; i < ntable; i++) {
       fc.table[i].r = rtable[i];
       fc.table[i].dr = drtable[i];
       fc.table[i].f = ftable[i];
       fc.table[i].df = dftable[i];
       fc.etable[i] = etable[i];
       fc.detable[i] = detable[i];
       fc.ctable[i] = ctable[i];
       fc.dctable[i] = dctable[i];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   // Local aliases are needed because the offload pragma takes plain
   // pointer names; storage was allocated earlier (alloc_if(0)).
   flt_t * special_lj = fc.special_lj;
   flt_t * special_coul = fc.special_coul;
   flt_t * cutsq = fc.cutsq[0];
   LJ_T * lj = fc.lj[0];
   TABLE_T * table = fc.table;
   flt_t * etable = fc.etable;
   flt_t * detable = fc.detable;
   flt_t * ctable = fc.ctable;
   flt_t * dctable = fc.dctable;
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
     in(table: length(ntable) alloc_if(0) free_if(0)) \
     in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    (Re)allocate the per-type tables (cutsq, lj: ntypes x ntypes) and the
    Coulomb lookup tables (table, etable, detable, ctable, dctable: ntable
    entries each). Nothing is reallocated when both sizes match the stored
    _ntypes/_ntable. Existing tables (when _ntypes > 0) are destroyed through
    the previously stored _memory; passing ntypes <= 0 therefore only frees.
    With _LMP_INTEL_OFFLOAD and cop >= 0, the matching coprocessor copies
    are freed / allocated alongside the host arrays.
    The new ntypes, ntable and memory are recorded on every call.
 ------------------------------------------------------------------------- */
 template <class flt_t>
 void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                               const int ntable,
                                                               Memory *memory,
                                                               const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       // Release the coprocessor copies before the host arrays go away.
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       flt_t * ocutsq = cutsq[0];
       typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
       table_t * otable = table;
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
           otable != NULL && oetable != NULL && odetable != NULL &&
           octable != NULL && odctable != NULL && ospecial_coul != NULL &&
           cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
           nocopy(otable: alloc_if(0) free_if(1)) \
           nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
       }
       #endif
 
       // Freed with the Memory object saved by the previous call.
       _memory->destroy(cutsq);
       _memory->destroy(lj);
       _memory->destroy(table);
       _memory->destroy(etable);
       _memory->destroy(detable);
       _memory->destroy(ctable);
       _memory->destroy(dctable);
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
       memory->create(lj,ntypes,ntypes,"fc.lj");
       memory->create(table,ntable,"pair:fc.table");
       memory->create(etable,ntable,"pair:fc.etable");
       memory->create(detable,ntable,"pair:fc.detable");
       memory->create(ctable,ntable,"pair:fc.ctable");
       memory->create(dctable,ntable,"pair:fc.dctable");
 
       #ifdef _LMP_INTEL_OFFLOAD
       // Allocate (without copying data) the coprocessor-side storage.
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       flt_t * ocutsq = cutsq[0];
       typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
       table_t * otable = table;
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
           otable !=NULL && oetable != NULL && odetable != NULL &&
           octable != NULL && odctable != NULL && ospecial_coul != NULL &&
           cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
           nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _ntable=ntable;
   _memory=memory;
 }
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
index e9775d6ec..c0bf6f35c 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -1,676 +1,676 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include "pair_lj_cut_coul_long_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "group.h"
 #include "kspace.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "memory.h"
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 #define C_FORCE_T typename ForceConst<flt_t>::c_force_t
 #define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
 #define TABLE_T typename ForceConst<flt_t>::table_t
 
 /* ---------------------------------------------------------------------- */
 
 // Construct on top of PairLJCutCoulLong, add the INTEL bit to suffix_flag,
 // and reset the rRESPA-related members (respa_enable = 0, cut_respa = NULL).
 PairLJCutCoulLongIntel::PairLJCutCoulLongIntel(LAMMPS *lmp) :
   PairLJCutCoulLong(lmp)
 {
   suffix_flag |= Suffix::INTEL;
   respa_enable = 0;
   cut_respa = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Destructor: intentionally empty, no work is done in this class's body.
 PairLJCutCoulLongIntel::~PairLJCutCoulLongIntel()
 {
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Public entry point. Selects the templated compute() instantiation that
 // matches fix->precision():
 //   PREC_MODE_MIXED  -> <float,double>  with force_const_single
 //   PREC_MODE_DOUBLE -> <double,double> with force_const_double
 //   anything else    -> <float,float>   with force_const_single
 // Afterwards calls fix->balance_stamp() and clears vflag_fdotr.
 void PairLJCutCoulLongIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver.
 //   1. Sets up energy/virial accumulation via ev_setup() when either flag
 //      is set, otherwise clears evflag and vflag_fdotr.
 //   2. If _lrt == 0, neighbor->ago != 0 and the fix does not use separate
 //      buffers, packs atom data into `buffers` (timed as TIME_PACK).
 //   3. Calls eval() twice for the chosen <EFLAG, NEWTON_PAIR> combination:
 //      once with offload=1 over [0, offload_end) and once with offload=0
 //      over [host_start, inum).
 template <class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
                                      IntelBuffers<flt_t,acc_t> *buffers,
                                      const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
   } else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
     // Only pack with multiple threads when more than INTEL_HTHREADS
     // threads are available; otherwise pack serially.
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // Virial mode handed to eval(): 2 when vflag_fdotr is set, 1 when only
   // vflag is set, 0 otherwise.
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
   if (eflag) {
     if (force->newton_pair) {
       eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
       eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
       eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
       eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Force/energy kernel for the atoms in the index range [astart, aend).
 //   EFLAG       - compile-time switch: accumulate energies when nonzero
 //   NEWTON_PAIR - compile-time switch: when nonzero the pair force is also
 //                 subtracted from neighbor atom j and each thread works on
 //                 its own f_stride-sized slice of the force buffer
 //   offload     - passed through to the buffer/fix accessors and used as
 //                 the if() condition of the offload pragma
 //   vflag       - virial mode computed by compute() (0, 1 or 2)
 //   buffers     - packed atom data, neighbor data and per-thread caches
 //   fc          - packed force constants and Coulomb tables
 // Returns immediately when the range is empty. Results are handed to the
 // fix through fix->add_result_array().
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
                                   IntelBuffers<flt_t,acc_t> *buffers,
                                   const ForceConst<flt_t> &fc,
                                   const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const flt_t qqrd2e = force->qqrd2e;
 
   // Flattened views of the 2-D coefficient arrays; rows are addressed
   // below as base + itype * ntypes.
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
   const C_ENERGY_T * _noalias const c_energy = fc.c_energy[0];
   const TABLE_T * _noalias const table = fc.table;
   const flt_t * _noalias const etable = fc.etable;
   const flt_t * _noalias const detable = fc.detable;
   const flt_t * _noalias const ctable = fc.ctable;
   const flt_t * _noalias const dctable = fc.dctable;
   const flt_t g_ewald = fc.g_ewald;
   const flt_t tabinnersq = fc.tabinnersq;
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Cache arrays shared by all threads; each thread uses the slice that
   // starts at tid * ccache_stride (see toffs below).
   flt_t * _noalias const ccachex = buffers->get_ccachex();
   flt_t * _noalias const ccachey = buffers->get_ccachey();
   flt_t * _noalias const ccachez = buffers->get_ccachez();
   flt_t * _noalias const ccachew = buffers->get_ccachew();
   int * _noalias const ccachei = buffers->get_ccachei();
   int * _noalias const ccachej = buffers->get_ccachej();
   const int ccache_stride = _ccache_stride;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
 
   const int nthreads = tc;
   #ifdef _LMP_INTEL_OFFLOAD
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
   const int ncoultablebits = this->ncoultablebits;
   const int ncoulmask = this->ncoulmask;
   const int ncoulshiftbits = this->ncoulshiftbits;
   #ifdef INTEL_ALLOW_TABLE
   #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
                     in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
                     in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
   #else
   #define ITABLE_IN
   #endif
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
     in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)    \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     ITABLE_IN signal(f_start)
   #endif
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, q);
 
     // Global energy / virial accumulators, reduced across OpenMP threads.
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       // With NEWTON_PAIR each thread writes to its own f_stride-sized
       // slice of f_start (zeroed here); otherwise all threads share one.
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       const int toffs = tid * ccache_stride;
       flt_t * _noalias const tdelx = ccachex + toffs;
       flt_t * _noalias const tdely = ccachey + toffs;
       flt_t * _noalias const tdelz = ccachez + toffs;
       flt_t * _noalias const trsq = ccachew + toffs;
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
       for (int i = iifrom; i < iito; i += iip) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
 
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         // Pass 1: keep only the neighbors with rsq below the pair cutoff
         // and store their displacement, rsq, type and raw list entry in
         // the per-thread cache; ej counts the entries kept.
         int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
           if (rsq < c_forcei[jtype].cutsq) {
             trsq[ej]=rsq;
             tdelx[ej]=delx;
             tdely[ej]=dely;
             tdelz[ej]=delz;
             tjtype[ej]=jtype;
             tj[ej]=jlist[jj];
             ej++;
           }
         }
 
         // Pass 2: evaluate the Coulomb and LJ terms for the ej cached
         // neighbors.
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
                                  sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
           forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
 
           const int j = tj[jj] & NEIGHMASK;
           // Two bits above SBBITS select the special_lj/special_coul entry.
           const int sbindex = tj[jj] >> SBBITS & 3;
           const int jtype = tjtype[jj];
           const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           #ifdef INTEL_ALLOW_TABLE
           if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
             // Direct evaluation: erfc is approximated by a degree-5
             // polynomial in t multiplied by exp(-grij^2).
             const flt_t A1 =  0.254829592;
             const flt_t A2 = -0.284496736;
             const flt_t A3 =  1.421413741;
             const flt_t A4 = -1.453152027;
             const flt_t A5 =  1.061405429;
             const flt_t EWALD_F = 1.12837917;
             const flt_t INV_EWALD_P = 1.0 / 0.3275911;
 
             const flt_t r = (flt_t)1.0 / sqrt(r2inv);
             const flt_t grij = g_ewald * r;
             const flt_t expm2 = exp(-grij * grij);
             const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
             const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
             const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
             forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
             if (EFLAG) ecoul = prefactor * erfc;
 
             // Applied unconditionally; pack_force_const() stores 1.0 in
             // special_coul[0], so adjust is zero when sbindex == 0.
             const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
               prefactor;
             forcecoul -= adjust;
             if (EFLAG) ecoul -= adjust;
 
           #ifdef INTEL_ALLOW_TABLE
           } else {
             // Tabulated evaluation: the table index is taken from the bit
             // pattern of the single-precision rsq.
             float rsq_lookup = rsq;
             const int itable = (__intel_castf32_u32(rsq_lookup) &
                                 ncoulmask) >> ncoulshiftbits;
             const flt_t fraction = (rsq_lookup - table[itable].r) *
               table[itable].dr;
 
             const flt_t tablet = table[itable].f +
               fraction * table[itable].df;
             forcecoul = qtmp * q[j] * tablet;
             if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
                                               fraction * detable[itable]);
             if (sbindex) {
               const flt_t table2 = ctable[itable] +
                 fraction * dctable[itable];
               const flt_t prefactor = qtmp * q[j] * table2;
               const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
                 prefactor;
               forcecoul -= adjust;
               if (EFLAG) ecoul -= adjust;
             }
           }
           #endif
 
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv -
                                c_forcei[jtype].lj2);
             if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv -
                                       c_energyi[jtype].lj4) -
                                c_energyi[jtype].offset;
 
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcelj *= factor_lj;
               if (EFLAG) evdwl *= factor_lj;
             }
           #ifdef INTEL_VMASK
           }
           #else
           // NOTE(review): this path keeps the LJ term when
           // rsq == cut_ljsq, whereas the INTEL_VMASK path above drops it
           // (strict rsq < cut_ljsq) - confirm which boundary is intended.
           if (rsq > c_forcei[jtype].cut_ljsq)
             { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
           const flt_t fpair = (forcecoul + forcelj) * r2inv;
           const flt_t fpx = fpair * tdelx[jj];
           fxtmp += fpx;
           if (NEWTON_PAIR) f[j].x -= fpx;
           const flt_t fpy = fpair * tdely[jj];
           fytmp += fpy;
           if (NEWTON_PAIR) f[j].y -= fpy;
           const flt_t fpz = fpair * tdelz[jj];
           fztmp += fpz;
           if (NEWTON_PAIR) f[j].z -= fpz;
 
           if (EFLAG) {
             sevdwl += evdwl;
             secoul += ecoul;
             if (eatom) {
               // Per-atom energy: half of the pair energy goes to i, and
               // with NEWTON_PAIR the other half goes to j.
               fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
               if (NEWTON_PAIR)
                 f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
             }
           }
           if (NEWTON_PAIR == 0)
             IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
                                   fpx, fpy, fpz);
         } // for jj
 
         // With NEWTON_PAIR other iterations may already have written to
         // f[i], so accumulate; otherwise this is the only write to f[i].
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
 
         IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     // Publish the totals: ev_global[0..1] hold the energies and
     // ev_global[2..7] the virial terms. Without NEWTON_PAIR the totals
     // are halved before being stored.
     if (EFLAG) {
       if (NEWTON_PAIR == 0) {
         oevdwl *= (acc_t)0.5;
         oecoul *= (acc_t)0.5;
       }
       ev_global[0] = oevdwl;
       ev_global[1] = oecoul;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Style initialisation.
 //   - runs PairLJCutCoulLong::init_style()
 //   - adjusts the most recently added neighbor request: full list instead
 //     of half when newton_pair is off, and always flags it as intel
 //   - looks up the "package_intel" fix and aborts with an error if absent
 //   - packs the force constants for the active precision mode
 //   - caches fix->lrt() in _lrt
 void PairLJCutCoulLongIntel::init_style()
 {
   PairLJCutCoulLong::init_style();
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
 
   // Mixed and single precision share force_const_single; only double
   // precision uses force_const_double.
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 
   _lrt = fix->lrt();
 }
 
 // Copy the pair coefficients and Coulomb tables of this style into the
 // packed ForceConst structure `fc`, and size the per-thread cache and
 // cutneighsq storage in `buffers`.
 //   fc      - destination for the packed constants (resized via set_ntypes)
 //   buffers - provides the ccache and the cutneighsq array
 // Raises an error through error->all() if any LJ cutoff exceeds the
 // corresponding pair cutoff. With _LMP_INTEL_OFFLOAD and _cop >= 0 the
 // packed arrays are also transferred to the coprocessor.
 template <class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop >= 0) off_ccache = 1;
   #endif
   buffers->grow_ccache(off_ccache, comm->nthreads, 1);
   _ccache_stride = buffers->ccache_stride();
 
   // tp1: type count plus one (types are 1-based in the loops below).
   // ntable: 2^ncoultablebits entries, or 1 when tables are disabled.
   int tp1 = atom->ntypes + 1;
   int ntable = 1;
   if (ncoultablebits)
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i, j);
         cutneigh = cut + neighbor->skin;
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   fc.g_ewald = force->kspace->g_ewald;
   fc.tabinnersq = tabinnersq;
 
   // Entry 0 of both special arrays is forced to 1.0 (the assignment is
   // repeated on every iteration, so it also overrides the i == 0 copy).
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_coul[i] = force->special_coul[i];
     fc.special_coul[0] = 1.0;
     fc.special_lj[0] = 1.0;
   }
 
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       if (cutsq[i][j] < cut_ljsq[i][j])
         error->all(FLERR,
          "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_force[i][j].lj1 = lj1[i][j];
       fc.c_force[i][j].lj2 = lj2[i][j];
       fc.c_energy[i][j].lj3 = lj3[i][j];
       fc.c_energy[i][j].lj4 = lj4[i][j];
       fc.c_energy[i][j].offset = offset[i][j];
     }
   }
 
   // The Coulomb tables are copied only when tabulation is enabled.
   if (ncoultablebits) {
     for (int i = 0; i < ntable; i++) {
       fc.table[i].r = rtable[i];
       fc.table[i].dr = drtable[i];
       fc.table[i].f = ftable[i];
       fc.table[i].df = dftable[i];
       fc.etable[i] = etable[i];
       fc.detable[i] = detable[i];
       fc.ctable[i] = ctable[i];
       fc.dctable[i] = dctable[i];
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
   flt_t * special_lj = fc.special_lj;
   flt_t * special_coul = fc.special_coul;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   TABLE_T * table = fc.table;
   flt_t * etable = fc.etable;
   flt_t * detable = fc.detable;
   flt_t * ctable = fc.ctable;
   flt_t * dctable = fc.dctable;
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
     in(table: length(ntable) alloc_if(0) free_if(0)) \
     in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
     in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
 // (Re)allocate the packed force-constant storage when the type count or
 // the Coulomb table size differs from the currently stored values.
 //   ntypes - dimension of the square c_force / c_energy arrays
 //   ntable - number of entries in table, etable, detable, ctable, dctable
 //   memory - allocator used for the new arrays (remembered in _memory)
 //   cop    - coprocessor index used for the new device allocations
 // Existing arrays are released first: on the coprocessor they were
 // allocated on (_cop) when offload is compiled in, then on the host through
 // the previously stored _memory. _ntypes, _ntable and _memory are updated
 // on every call, even when nothing is reallocated.
 template <class flt_t>
 void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                            const int ntable,
                                                            Memory *memory,
                                                            const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       table_t * otable = table;
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       if (ospecial_lj != NULL && oc_force != NULL &&
           oc_energy != NULL && otable != NULL && oetable != NULL &&
           odetable != NULL && octable != NULL && odctable != NULL &&
           ospecial_coul != NULL && _cop >= 0) {
         // Free on the device the buffers were allocated on (_cop), not on
         // the incoming `cop`, which may differ or be negative.
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
           nocopy(otable: alloc_if(0) free_if(1)) \
           nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
       }
       #endif
 
       // Host-side release goes through the allocator that created the
       // arrays.
       _memory->destroy(c_force);
       _memory->destroy(c_energy);
       _memory->destroy(table);
       _memory->destroy(etable);
       _memory->destroy(detable);
       _memory->destroy(ctable);
       _memory->destroy(dctable);
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(c_force,ntypes,ntypes,"fc.c_force");
       memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
       memory->create(table,ntable,"pair:fc.table");
       memory->create(etable,ntable,"pair:fc.etable");
       memory->create(detable,ntable,"pair:fc.detable");
       memory->create(ctable,ntable,"pair:fc.ctable");
       memory->create(dctable,ntable,"pair:fc.dctable");
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       table_t * otable = table;
       flt_t * oetable = etable;
       flt_t * odetable = detable;
       flt_t * octable = ctable;
       flt_t * odctable = dctable;
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL &&
           oc_energy != NULL && otable !=NULL && oetable != NULL &&
           odetable != NULL && octable != NULL && odctable != NULL &&
           ospecial_coul != NULL && cop >= 0) {
         // Allocate (without copying) matching device buffers; the data is
         // sent later by pack_force_const().
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
           nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
           nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
       }
       #endif
     }
   }
   _ntypes=ntypes;
   _ntable=ntable;
   _memory=memory;
 }
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
index 487182184..f5a7999ee 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -1,474 +1,474 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    This software is distributed under the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
 #include <math.h>
 #include "pair_lj_cut_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
 #include "memory.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "neigh_request.h"
 
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
 #define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
 #define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
 
 /* ---------------------------------------------------------------------- */
 
 // Construct on top of PairLJCut, add the INTEL bit to suffix_flag, and
 // reset the rRESPA-related members (respa_enable = 0, cut_respa = NULL).
 PairLJCutIntel::PairLJCutIntel(LAMMPS *lmp) :
   PairLJCut(lmp)
 {
   suffix_flag |= Suffix::INTEL;
   respa_enable = 0;
   cut_respa = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Public entry point. Selects the templated compute() instantiation that
 // matches fix->precision():
 //   PREC_MODE_MIXED  -> <float,double>  with force_const_single
 //   PREC_MODE_DOUBLE -> <double,double> with force_const_double
 //   anything else    -> <float,float>   with force_const_single
 // Afterwards calls fix->balance_stamp() and clears vflag_fdotr.
 void PairLJCutIntel::compute(int eflag, int vflag)
 {
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
   else
     compute<float,float>(eflag, vflag, fix->get_single_buffers(),
                          force_const_single);
 
   fix->balance_stamp();
   vflag_fdotr = 0;
 }
 
 // Precision-specific driver.
 //   1. Sets up energy/virial accumulation via ev_setup() when either flag
 //      is set, otherwise clears evflag and vflag_fdotr.
 //   2. If neighbor->ago != 0 and the fix does not use separate buffers,
 //      packs atom data into `buffers` (timed as TIME_PACK).
 //   3. Calls eval() twice for the chosen <ONETYPE, EFLAG, NEWTON_PAIR>
 //      combination: once with offload=1 over [0, offload_end) and once
 //      with offload=0 over [host_start, inum). ONETYPE follows _onetype.
 template <class flt_t, class acc_t>
 void PairLJCutIntel::compute(int eflag, int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag, vflag);
   } else evflag = vflag_fdotr = 0;
 
   const int inum = list->inum;
   const int nthreads = comm->nthreads;
   const int host_start = fix->host_start_pair();
   const int offload_end = fix->offload_end_pair();
   const int ago = neighbor->ago;
 
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
     // Only pack with multiple threads when more than INTEL_HTHREADS
     // threads are available; otherwise pack serially.
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
     #if defined(_OPENMP)
     #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
                                 packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // Virial mode handed to eval(): 2 when vflag_fdotr is set, 1 when only
   // vflag is set, 0 otherwise.
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
   if (_onetype) {
     if (eflag) {
       if (force->newton_pair) {
         eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
         eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   } else {
     if (eflag) {
       if (force->newton_pair) {
         eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
         eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
         eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
         eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   }
 }
 
 template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCutIntel::eval(const int offload, const int vflag,
                           IntelBuffers<flt_t,acc_t> *buffers,
                           const ForceConst<flt_t> &fc,
                           const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
   int nlocal, nall, minlocal;
   fix->get_buffern(offload, nlocal, nall, minlocal);
 
   const int ago = neighbor->ago;
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
 
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
   const flt_t * _noalias const special_lj = fc.special_lj;
   const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
   const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
   acc_t * _noalias ev_global;
   IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
   const int nthreads = tc;
   int *overflow = fix->get_off_overflow_flag();
   {
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = (acc_t)0;
     if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iip, iito, tid;
       IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
       int foff;
       if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
       else foff = -minlocal;
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       flt_t cutsq, lj1, lj2, lj3, lj4, offset;
       if (ONETYPE) {
         cutsq = ljc12o[3].cutsq;
         lj1 = ljc12o[3].lj1;
         lj2 = ljc12o[3].lj2;
         lj3 = lj34[3].lj3;
         lj4 = lj34[3].lj4;
         offset = ljc12o[3].offset;
       }
       for (int i = iifrom; i < iito; i += iip) {
         int itype, ptr_off;
         const FC_PACKED1_T * _noalias ljc12oi;
         const FC_PACKED2_T * _noalias lj34i;
         if (!ONETYPE) {
           itype = x[i].w;
           ptr_off = itype * ntypes;
           ljc12oi = ljc12o + ptr_off;
           lj34i = lj34 + ptr_off;
         }
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
         if (EFLAG) fwtmp = sevdwl = (acc_t)0;
         if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
                                sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcelj, evdwl;
           forcelj = evdwl = (flt_t)0.0;
 
           int j, jtype, sbindex;
           if (!ONETYPE) {
             sbindex = jlist[jj] >> SBBITS & 3;
             j = jlist[jj] & NEIGHMASK;
           } else
             j = jlist[jj];
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           if (!ONETYPE) {
             jtype = x[j].w;
             cutsq = ljc12oi[jtype].cutsq;
           }
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
           #ifdef INTEL_VMASK
           if (rsq < cutsq) {
           #endif
             flt_t factor_lj;
             if (!ONETYPE) factor_lj = special_lj[sbindex];
             flt_t r2inv = 1.0 / rsq;
             flt_t r6inv = r2inv * r2inv * r2inv;
             #ifndef INTEL_VMASK
             if (rsq > cutsq) r6inv = (flt_t)0.0;
             #endif
             if (!ONETYPE) {
               lj1 = ljc12oi[jtype].lj1;
               lj2 = ljc12oi[jtype].lj2;
             }
             forcelj = r6inv * (lj1 * r6inv - lj2);
             flt_t fpair;
             if (!ONETYPE)
               fpair = factor_lj * forcelj * r2inv;
             else
               fpair = forcelj * r2inv;
 
             const flt_t fpx = fpair * delx;
             fxtmp += fpx;
             if (NEWTON_PAIR) f[j].x -= fpx;
             const flt_t fpy = fpair * dely;
             fytmp += fpy;
             if (NEWTON_PAIR) f[j].y -= fpy;
             const flt_t fpz = fpair * delz;
             fztmp += fpz;
             if (NEWTON_PAIR) f[j].z -= fpz;
 
             if (EFLAG) {
               if (!ONETYPE) {
                 lj3 = lj34i[jtype].lj3;
                 lj4 = lj34i[jtype].lj4;
                 offset = ljc12oi[jtype].offset;
               }
               evdwl = r6inv * (lj3 * r6inv - lj4);
               #ifdef INTEL_VMASK
               evdwl -= offset;
               #else
               if (rsq < cutsq) evdwl -= offset;
               #endif
               if (!ONETYPE) evdwl *= factor_lj;
               sevdwl += evdwl;
               if (eatom) {
                 fwtmp += (flt_t)0.5 * evdwl;
                 if (NEWTON_PAIR)
                   f[j].w += (flt_t)0.5 * evdwl;
               }
             }
 
             if (NEWTON_PAIR == 0)
               IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           } // if rsq
           #endif
         } // for jj
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
 
         IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
                               f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
                               ov4, ov5);
     } // end omp
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
         ov3 *= (acc_t)0.5;
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
       ev_global[5] = ov3;
       ev_global[6] = ov4;
       ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
 
   if (offload)
     fix->stop_watch(TIME_OFFLOAD_LATENCY);
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
   if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairLJCutIntel::init_style()
 {
   PairLJCut::init_style();
   if (force->newton_pair == 0) {
     neighbor->requests[neighbor->nrequest-1]->half = 0;
     neighbor->requests[neighbor->nrequest-1]->full = 1;
   }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   if (fix->offload_balance() != 0.0)
     error->all(FLERR,
           "Offload for lj/cut/intel is not yet available. Set balance to 0.");
   #endif
 
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     pack_force_const(force_const_single, fix->get_mixed_buffers());
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     pack_force_const(force_const_double, fix->get_double_buffers());
   else
     pack_force_const(force_const_single, fix->get_single_buffers());
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Copy the pair coefficients of this style into the packed tables of fc and
 // refresh the squared cutoffs stored in this class and in buffers.
 //
 // fc       destination force-constant container; resized to
 //          (atom->ntypes + 1) before being filled
 // buffers  intel buffers object; its type count is set and its cutneighsq
 //          table is filled here
 template <class flt_t, class acc_t>
 void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
                                       IntelBuffers<flt_t,acc_t> *buffers)
 {
   // _onetype is enabled only for a single atom type with atom->molecular
   // equal to zero.
   _onetype = 0;
   if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
 
   // Tables are dimensioned ntypes + 1 in each direction.
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,memory,_cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
   double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       // Only pairs that were set explicitly, or whose two diagonal entries
       // were both set, are processed.
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
         cutneigh = cut + neighbor->skin;
         // Store both (i,j) and (j,i) so the tables are symmetric.
         cutsq[i][j] = cutsq[j][i] = cut*cut;
         cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
 
   // Copy the four special_lj factors; entry 0 is then forced to 1.0
   // regardless of force->special_lj[0] (the assignment sits inside the loop
   // and is simply repeated on every iteration).
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_lj[0] = 1.0;
   }
 
   // Copy the per-type-pair LJ coefficients, squared cutoff and energy
   // offset into the packed tables.
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
+  for (int i = 1; i < tp1; i++) {
+    for (int j = 1; j < tp1; j++) {
       fc.ljc12o[i][j].lj1 = lj1[i][j];
       fc.ljc12o[i][j].lj2 = lj2[i][j];
       fc.lj34[i][j].lj3 = lj3[i][j];
       fc.lj34[i][j].lj4 = lj4[i][j];
       fc.ljc12o[i][j].cutsq = cutsq[i][j];
       fc.ljc12o[i][j].offset = offset[i][j];
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // Resize the packed coefficient tables ljc12o and lj34 to ntypes x ntypes.
 //
 // ntypes  new table dimension; nothing is allocated when it is <= 0
 // memory  allocator used to create the new tables; it is remembered in
 //         _memory and used to release them on a later call
 // cop     value stored in _cop when new tables are allocated
 //
 // When ntypes equals the current size no memory is freed or allocated;
 // only the bookkeeping members are refreshed.
 template <class flt_t>
 void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                    Memory *memory,
                                                    const int cop) {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       // Release the complete 2-D tables through the allocator that created
       // them. Destroying only row 0 (the contiguous data block) would leak
       // the row-pointer array and leave ljc12o / lj34 dangling.
       _memory->destroy(ljc12o);
       _memory->destroy(lj34);
     }
     if (ntypes > 0) {
       _cop = cop;
       memory->create(ljc12o,ntypes,ntypes,"fc.c12o");
       memory->create(lj34,ntypes,ntypes,"fc.lj34");
     }
   }
   _ntypes = ntypes;
   _memory = memory;
 }