diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index 8f7ae57f44a0751b8f8f321de80c15ba583375ad..32e6f3a50d8e5e468e719a4e006db130b410b9b7 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -11,139 +11,256 @@
 //  file will be generated.  See ARM document DUI0348B.
 //
 //===----------------------------------------------------------------------===//
+//
+// Each intrinsic is a subclass of the Inst class. An intrinsic can either
+// generate a __builtin_* call or it can expand to a set of generic operations.
+//
+// The operations are subclasses of Operation providing a list of DAGs, the
+// last of which is the return value. The available DAG nodes are documented
+// below.
+//
+//===----------------------------------------------------------------------===//
+
+// The base Operation class. All operations must subclass this.
+class Operation<list<dag> ops=[]> {
+  list<dag> Ops = ops;
+  bit Unavailable = 0;
+}
+// An operation that only contains a single DAG.
+class Op<dag op> : Operation<[op]>;
+// A shorter version of Operation - takes a list of DAGs. The last of these will
+// be the return value.
+class LOp<list<dag> ops> : Operation<ops>;
+
+// These defs and classes are used internally to implement the SetTheory
+// expansion and should be ignored.
+foreach Index = 0-63 in
+  def sv##Index;
+class MaskExpand;
+
+//===----------------------------------------------------------------------===//
+// Available operations
+//===----------------------------------------------------------------------===//
+
+// DAG arguments can either be operations (documented below) or variables.
+// Variables are prefixed with '$'. There are variables for each input argument,
+// with the name $pN, where N starts at zero. So the zero'th argument will be
+// $p0, the first $p1 etc.
+
+// op - Binary or unary operator, depending on the number of arguments. The
+//      operator itself is just treated as a raw string and is not checked.
+// example: (op "+", $p0, $p1) -> "__p0 + __p1".
+//          (op "-", $p0)      -> "-__p0"
+def op;
+// call - Invoke another intrinsic. The input types are type checked and
+//        disambiguated. If there is no intrinsic defined that takes
+//        the given types (or if there is a type ambiguity) an error is
+//        generated at tblgen time. The name of the intrinsic is the raw
+//        name as given to the Inst class (not mangled).
+// example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)"
+//            (assuming $p0 has type int16x8_t).
+def call;
+// cast - Perform a cast to a different type. This gets emitted as a static
+//        C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use
+//        "bitcast".
+//
+//        The syntax is (cast MOD* VAL). The last argument is the value to
+//        cast, preceded by a sequence of type modifiers. The target type
+//        starts off as the type of VAL, and is modified by MOD in sequence.
+//        The available modifiers are:
+//          - $X  - Take the type of parameter/variable X. For example:
+//                  (cast $p0, $p1) would cast $p1 to the type of $p0.
+//          - "R" - The type of the return type.
+//          - A typedef string - A NEON or stdint.h type that is then parsed.
+//                               for example: (cast "uint32x4_t", $p0).
+//          - "U" - Make the type unsigned.
+//          - "S" - Make the type signed.
+//          - "H" - Halve the number of lanes in the type.
+//          - "D" - Double the number of lanes in the type.
+//          - "8" - Convert type to an equivalent vector of 8-bit signed
+//                  integers.
+// example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return
+//           value is of type "int32x4_t").
+//          (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0
+//           has type float64x1_t or any other vector type of 64 bits).
+//          (cast "int32_t", $p2) -> "(int32_t)__p2"
+def cast;
+// bitcast - Same as "cast", except a reinterpret-cast is produced:
+//             (bitcast "T", $p0) -> "*(T*)&__p0".
+//           The VAL argument is saved to a temporary so it can be used
+//           as an l-value.
+def bitcast;
+// dup - Take a scalar argument and create a vector by duplicating it into
+//       all lanes. The type of the vector is the base type of the intrinsic.
+// example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type
+//          is uint32x2_t).
+def dup;
+// splat - Take a vector and a lane index, and return a vector of the same type
+//         containing repeated instances of the source vector at the lane index.
+// example: (splat $p0, $p1) ->
+//            "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)"
+//          (assuming __p0 has four elements).
+def splat;
+// save_temp - Create a temporary (local) variable. The variable takes a name
+//             based on the zero'th parameter and can be referenced by that
+//             name in subsequent DAGs in the same operation. The scope of a
+//             temp is the operation. If a variable
+//             with the given name already exists, an error will be given at
+//             tblgen time.
+// example: [(save_temp $var, (call "foo", $p0)),
+//           (op "+", $var, $p1)] ->
+//              "int32x2_t __var = foo(__p0); return __var + __p1;"
+def save_temp;
+// name_replace - Return the name of the current intrinsic with the first
+//                argument replaced by the second argument. Raises an error if
+//                the first argument does not exist in the intrinsic name.
+// example: (call (name_replace "_high_", "_"), $p0) (to call the non-high
+//            version of this intrinsic).
+def name_replace;
+// literal - Create a literal piece of code. The code is treated as a raw
+//           string, and must be given a type. The type is a stdint.h or
+//           NEON intrinsic type as given to (cast).
+// example: (literal "int32_t", "0")
+def literal;
+// shuffle - Create a vector shuffle. The syntax is (shuffle ARG0, ARG1, MASK).
+//           The MASK argument is a set of elements. The elements are generated
+//           from the two special defs "mask0" and "mask1". "mask0" expands to
+//           the lane indices in sequence for ARG0, and "mask1" expands to
+//           the lane indices in sequence for ARG1. They can be used as-is, e.g.
+//
+//             (shuffle $p0, $p1, mask0) -> $p0
+//             (shuffle $p0, $p1, mask1) -> $p1
+//
+//           or, more usefully, they can be manipulated using the SetTheory
+//           operators plus some extra operators defined in the NEON emitter.
+//           The operators are described below.
+// example: (shuffle $p0, $p1, (add (highhalf mask0), (highhalf mask1))) ->
+//            A concatenation of the high halves of the input vectors.
+def shuffle;
+
+// add, interleave, decimate: These set operators are vanilla SetTheory
+// operators and take their normal definition.
+def add;
+def interleave;
+def decimate;
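+// example: (add (decimate mask0, 2), (decimate mask1, 2)) ->
+//            [0, 2, 4, 6, 8, 10, 12, 14] (assuming 8-element input vectors;
+//            mask1's lane numbering continues after mask0's, so this selects
+//            the even lanes of both inputs - the mask used by OP_UZP1 below).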
+// rotl - Rotate set left by a number of elements.
+// example: (rotl mask0, 3) -> [3, 4, 5, 6, 0, 1, 2]
+def rotl;
+// rotr - Rotate set right by a number of elements.
+// example: (rotr mask0, 3) -> [4, 5, 6, 0, 1, 2, 3]
+def rotr;
+// highhalf - Take only the high half of the input.
+// example: (highhalf mask0) -> [4, 5, 6, 7] (assuming mask0 had 8 elements)
+def highhalf;
+// lowhalf - Take only the low half of the input.
+// example: (lowhalf mask0) -> [0, 1, 2, 3] (assuming mask0 had 8 elements)
+def lowhalf;
+// rev - Perform a variable-width reversal of the elements. The zero'th argument
+//       is a width in bits to reverse. The lanes this maps to are determined
+//       based on the element width of the underlying type.
+// example: (rev 32, mask0) -> [3, 2, 1, 0, 7, 6, 5, 4] (if 8-bit elements)
+// example: (rev 32, mask0) -> [1, 0, 3, 2]             (if 16-bit elements)
+def rev;
+// mask0 - The initial sequence of lanes for shuffle ARG0
+def mask0 : MaskExpand;
+// mask1 - The initial sequence of lanes for shuffle ARG1
+def mask1 : MaskExpand;
+
+def OP_NONE  : Operation;
+def OP_UNAVAILABLE : Operation {
+  let Unavailable = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions
+//===----------------------------------------------------------------------===//
+
+// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and
+// a sequence of typespecs.
+//
+// The name is the base name of the intrinsic, for example "vget_lane". This is
+// then mangled by the tblgen backend to add type information ("vget_lane_s16").
+//
+// A typespec is a sequence of uppercase characters (modifiers) followed by one
+// lowercase character. A typespec encodes a particular "base type" of the
+// intrinsic.
+//
+// An example typespec is "Qs" - quad-size short - uint16x8_t. The available
+// typespec codes are given below.
+//
+// The string given to an Inst class is a sequence of typespecs. The intrinsic
+// is instantiated for every typespec in the sequence. For example "sdQsQd".
+//
+// The prototype is a string that defines the return type of the intrinsic
+// and the type of each argument. The return type and every argument gets a
+// "modifier" that can change in some way the "base type" of the intrinsic.
+//
+// The modifier 'd' means "default" and does not modify the base type in any
+// way. The available modifiers are given below, followed by a worked example.
+//
+// Typespecs
+// ---------
+// c: char
+// s: short
+// i: int
+// l: long
+// k: 128-bit long
+// f: float
+// h: half-float
+// d: double
+//
+// Typespec modifiers
+// ------------------
+// S: scalar, only used for function mangling.
+// U: unsigned
+// Q: 128b
+// H: 128b without mangling 'q'
+// P: polynomial
+//
+// Prototype modifiers
+// -------------------
+// prototype: return (arg, arg, ...)
+//
+// v: void
+// t: best-fit integer (int/poly args)
+// x: signed integer   (int/float args)
+// u: unsigned integer (int/float args)
+// f: float (int args)
+// F: double (int args)
+// d: default
+// g: default, ignore 'Q' size modifier.
+// j: default, force 'Q' size modifier.
+// w: double width elements, same num elts
+// n: double width elements, half num elts
+// h: half width elements, double num elts
+// q: half width elements, quad num elts
+// e: half width elements, double num elts, unsigned
+// m: half width elements, same num elts
+// i: constant int
+// l: constant uint64
+// s: scalar of element type
+// z: scalar of half width element type, signed
+// r: scalar of double width element type, signed
+// a: scalar of element type (splat to vector type)
+// b: scalar of unsigned integer/long type (int/float args)
+// $: scalar of signed integer/long type (int/float args)
+// y: scalar of float
+// o: scalar of double
+// k: default elt width, double num elts
+// 2,3,4: array of default vectors
+// B,C,D: array of default elts, force 'Q' size modifier.
+// p: pointer type
+// c: const pointer type
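+//
+// Putting this together: a hypothetical ClassS definition (say, "vfoo", which
+// is not a real intrinsic in this file) with prototype "ddd" and typespec
+// string "sQs" would be instantiated twice, roughly as
+//   int16x4_t vfoo_s16(int16x4_t, int16x4_t)    // from typespec "s"
+//   int16x8_t vfooq_s16(int16x8_t, int16x8_t)   // from typespec "Qs"
+// since 'd' leaves the base type unmodified and 'Q' selects the 128-bit
+// vector type (and adds the 'q' to the mangled name).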
 
-class Op;
-
-def OP_NONE  : Op;
-def OP_UNAVAILABLE : Op;
-def OP_ADD   : Op;
-def OP_ADDL  : Op;
-def OP_ADDLHi : Op;
-def OP_ADDW  : Op;
-def OP_ADDWHi : Op;
-def OP_SUB   : Op;
-def OP_SUBL  : Op;
-def OP_SUBLHi : Op;
-def OP_SUBW  : Op;
-def OP_SUBWHi : Op;
-def OP_MUL   : Op;
-def OP_MLA   : Op;
-def OP_MLAL  : Op;
-def OP_MULLHi : Op;
-def OP_MULLHi_P64 : Op;
-def OP_MULLHi_N : Op;
-def OP_MLALHi : Op;
-def OP_MLALHi_N : Op;
-def OP_MLS   : Op;
-def OP_MLSL  : Op;
-def OP_MLSLHi : Op;
-def OP_MLSLHi_N : Op;
-def OP_MUL_N : Op;
-def OP_MLA_N : Op;
-def OP_MLS_N : Op;
-def OP_FMLA_N : Op;
-def OP_FMLS_N : Op;
-def OP_MLAL_N : Op;
-def OP_MLSL_N : Op;
-def OP_MUL_LN: Op;
-def OP_MULX_LN: Op;
-def OP_MULL_LN : Op;
-def OP_MULLHi_LN : Op;
-def OP_MLA_LN: Op;
-def OP_MLS_LN: Op;
-def OP_MLAL_LN : Op;
-def OP_MLALHi_LN : Op;
-def OP_MLSL_LN : Op;
-def OP_MLSLHi_LN : Op;
-def OP_QDMULL_LN : Op;
-def OP_QDMULLHi_LN : Op;
-def OP_QDMLAL_LN : Op;
-def OP_QDMLALHi_LN : Op;
-def OP_QDMLSL_LN : Op;
-def OP_QDMLSLHi_LN : Op;
-def OP_QDMULH_LN : Op;
-def OP_QRDMULH_LN : Op;
-def OP_FMS_LN : Op;
-def OP_FMS_LNQ : Op;
-def OP_TRN1  : Op;
-def OP_ZIP1  : Op;
-def OP_UZP1  : Op;
-def OP_TRN2  : Op;
-def OP_ZIP2  : Op;
-def OP_UZP2  : Op;
-def OP_EQ    : Op;
-def OP_GE    : Op;
-def OP_LE    : Op;
-def OP_GT    : Op;
-def OP_LT    : Op;
-def OP_NEG   : Op;
-def OP_NOT   : Op;
-def OP_AND   : Op;
-def OP_OR    : Op;
-def OP_XOR   : Op;
-def OP_ANDN  : Op;
-def OP_ORN   : Op;
-def OP_CAST  : Op;
-def OP_HI    : Op;
-def OP_LO    : Op;
-def OP_CONC  : Op;
-def OP_DUP   : Op;
-def OP_DUP_LN: Op;
-def OP_SEL   : Op;
-def OP_REV64 : Op;
-def OP_REV32 : Op;
-def OP_REV16 : Op;
-def OP_XTN : Op;
-def OP_SQXTUN : Op;
-def OP_QXTN : Op;
-def OP_VCVT_NA_HI : Op;
-def OP_VCVT_EX_HI : Op;
-def OP_VCVTX_HI : Op;
-def OP_REINT : Op;
-def OP_ADDHNHi : Op;
-def OP_RADDHNHi : Op;
-def OP_SUBHNHi : Op;
-def OP_RSUBHNHi : Op;
-def OP_ABDL  : Op;
-def OP_ABDLHi : Op;
-def OP_ABA   : Op;
-def OP_ABAL  : Op;
-def OP_ABALHi : Op;
-def OP_QDMULLHi : Op;
-def OP_QDMULLHi_N : Op;
-def OP_QDMLALHi : Op;
-def OP_QDMLALHi_N : Op;
-def OP_QDMLSLHi : Op;
-def OP_QDMLSLHi_N : Op;
-def OP_DIV  : Op;
-def OP_LONG_HI : Op;
-def OP_NARROW_HI : Op;
-def OP_MOVL_HI : Op;
-def OP_COPY_LN : Op;
-def OP_COPYQ_LN : Op;
-def OP_COPY_LNQ : Op;
-def OP_SCALAR_MUL_LN : Op;
-def OP_SCALAR_MUL_LNQ : Op;
-def OP_SCALAR_MULX_LN : Op;
-def OP_SCALAR_MULX_LNQ : Op;
-def OP_SCALAR_VMULX_LN : Op;
-def OP_SCALAR_VMULX_LNQ : Op;
-def OP_SCALAR_QDMULL_LN : Op;
-def OP_SCALAR_QDMULL_LNQ : Op;
-def OP_SCALAR_QDMULH_LN : Op;
-def OP_SCALAR_QDMULH_LNQ : Op;
-def OP_SCALAR_QRDMULH_LN : Op;
-def OP_SCALAR_QRDMULH_LNQ : Op;
-def OP_SCALAR_GET_LN : Op;
-def OP_SCALAR_SET_LN : Op;
-
-class Inst <string n, string p, string t, Op o> {
+// Every intrinsic subclasses Inst.
+class Inst <string n, string p, string t, Operation o> {
   string Name = n;
   string Prototype = p;
   string Types = t;
   string ArchGuard = "";
 
-  Op Operand = o;
+  Operation Operation = o;
+  bit CartesianProductOfTypes = 0;
   bit isShift = 0;
   bit isScalarShift = 0;
   bit isScalarNarrowShift = 0;
@@ -186,60 +303,193 @@ class WInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
 // WOpInst:       Instruction with bit size only suffix (e.g., "8").
 // LOpInst:       Logical instruction with no bit size suffix.
 // NoTestOpInst:  Intrinsic that has no corresponding instruction.
-class SOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
-class IOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
-class WOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
-class LOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
-class NoTestOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
+class SOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class IOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class WOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class LOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
+class NoTestOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
 
-// prototype: return (arg, arg, ...)
-// v: void
-// t: best-fit integer (int/poly args)
-// x: signed integer   (int/float args)
-// u: unsigned integer (int/float args)
-// f: float (int args)
-// F: double (int args)
-// d: default
-// g: default, ignore 'Q' size modifier.
-// j: default, force 'Q' size modifier.
-// w: double width elements, same num elts
-// n: double width elements, half num elts
-// h: half width elements, double num elts
-// q: half width elements, quad num elts
-// e: half width elements, double num elts, unsigned
-// m: half width elements, same num elts
-// i: constant int
-// l: constant uint64
-// s: scalar of element type
-// z: scalar of half width element type, signed
-// r: scalar of double width element type, signed
-// a: scalar of element type (splat to vector type)
-// b: scalar of unsigned integer/long type (int/float args)
-// $: scalar of signed integer/long type (int/float args)
-// y: scalar of float
-// o: scalar of double
-// k: default elt width, double num elts
-// 2,3,4: array of default vectors
-// B,C,D: array of default elts, force 'Q' size modifier.
-// p: pointer type
-// c: const pointer type
+//===----------------------------------------------------------------------===//
+// Operations
+//===----------------------------------------------------------------------===//
 
-// sizes:
-// c: char
-// s: short
-// i: int
-// l: long
-// k: 128-bit long
-// f: float
-// h: half-float
-// d: double
+def OP_ADD      : Op<(op "+", $p0, $p1)>;
+def OP_ADDL     : Op<(op "+", (call "vmovl", $p0), (call "vmovl", $p1))>;
+def OP_ADDLHi   : Op<(op "+", (call "vmovl_high", $p0),
+                              (call "vmovl_high", $p1))>;
+def OP_ADDW     : Op<(op "+", $p0, (call "vmovl", $p1))>;
+def OP_ADDWHi   : Op<(op "+", $p0, (call "vmovl_high", $p1))>;
+def OP_SUB      : Op<(op "-", $p0, $p1)>;
+def OP_SUBL     : Op<(op "-", (call "vmovl", $p0), (call "vmovl", $p1))>;
+def OP_SUBLHi   : Op<(op "-", (call "vmovl_high", $p0),
+                              (call "vmovl_high", $p1))>;
+def OP_SUBW     : Op<(op "-", $p0, (call "vmovl", $p1))>;
+def OP_SUBWHi   : Op<(op "-", $p0, (call "vmovl_high", $p1))>;
+def OP_MUL      : Op<(op "*", $p0, $p1)>;
+def OP_MLA      : Op<(op "+", $p0, (op "*", $p1, $p2))>;
+def OP_MLAL     : Op<(op "+", $p0, (call "vmull", $p1, $p2))>;
+def OP_MULLHi   : Op<(call "vmull", (call "vget_high", $p0),
+                                    (call "vget_high", $p1))>;
+def OP_MULLHi_P64 : Op<(call "vmull",
+                         (cast "poly64_t", (call "vget_high", $p0)),
+                         (cast "poly64_t", (call "vget_high", $p1)))>;
+def OP_MULLHi_N : Op<(call "vmull_n", (call "vget_high", $p0), $p1)>;
+def OP_MLALHi   : Op<(call "vmlal", $p0, (call "vget_high", $p1),
+                                         (call "vget_high", $p2))>;
+def OP_MLALHi_N : Op<(call "vmlal_n", $p0, (call "vget_high", $p1), $p2)>;
+def OP_MLS      : Op<(op "-", $p0, (op "*", $p1, $p2))>;
+def OP_MLSL     : Op<(op "-", $p0, (call "vmull", $p1, $p2))>;
+def OP_MLSLHi   : Op<(call "vmlsl", $p0, (call "vget_high", $p1),
+                                         (call "vget_high", $p2))>;
+def OP_MLSLHi_N : Op<(call "vmlsl_n", $p0, (call "vget_high", $p1), $p2)>;
+def OP_MUL_N    : Op<(op "*", $p0, (dup $p1))>;
+def OP_MLA_N    : Op<(op "+", $p0, (op "*", $p1, (dup $p2)))>;
+def OP_MLS_N    : Op<(op "-", $p0, (op "*", $p1, (dup $p2)))>;
+def OP_FMLA_N   : Op<(call "vfma", $p0, $p1, (dup $p2))>;
+def OP_FMLS_N   : Op<(call "vfms", $p0, $p1, (dup $p2))>;
+def OP_MLAL_N   : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>;
+def OP_MLSL_N   : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>;
+def OP_MUL_LN   : Op<(op "*", $p0, (splat $p1, $p2))>;
+def OP_MULX_LN  : Op<(call "vmulx", $p0, (splat $p1, $p2))>;
+def OP_MULL_LN  : Op<(call "vmull", $p0, (splat $p1, $p2))>;
+def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (splat $p1, $p2))>;
+def OP_MLA_LN   : Op<(op "+", $p0, (op "*", $p1, (splat $p2, $p3)))>;
+def OP_MLS_LN   : Op<(op "-", $p0, (op "*", $p1, (splat $p2, $p3)))>;
+def OP_MLAL_LN  : Op<(op "+", $p0, (call "vmull", $p1, (splat $p2, $p3)))>;
+def OP_MLALHi_LN: Op<(op "+", $p0, (call "vmull", (call "vget_high", $p1),
+                                                  (splat $p2, $p3)))>;
+def OP_MLSL_LN  : Op<(op "-", $p0, (call "vmull", $p1, (splat $p2, $p3)))>;
+def OP_MLSLHi_LN : Op<(op "-", $p0, (call "vmull", (call "vget_high", $p1),
+                                                   (splat $p2, $p3)))>;
+def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (splat $p1, $p2))>;
+def OP_QDMULLHi_LN : Op<(call "vqdmull", (call "vget_high", $p0),
+                                         (splat $p1, $p2))>;
+def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (splat $p2, $p3))>;
+def OP_QDMLALHi_LN : Op<(call "vqdmlal", $p0, (call "vget_high", $p1),
+                                              (splat $p2, $p3))>;
+def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (splat $p2, $p3))>;
+def OP_QDMLSLHi_LN : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1),
+                                              (splat $p2, $p3))>;
+def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (splat $p1, $p2))>;
+def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (splat $p1, $p2))>;
+def OP_FMS_LN   : Op<(call "vfma_lane", $p0, $p1, (op "-", $p2), $p3)>;
+def OP_FMS_LNQ  : Op<(call "vfma_laneq", $p0, $p1, (op "-", $p2), $p3)>;
+def OP_TRN1     : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2),
+                                                    (decimate mask1, 2)))>;
+def OP_ZIP1     : Op<(shuffle $p0, $p1, (lowhalf (interleave mask0, mask1)))>;
+def OP_UZP1     : Op<(shuffle $p0, $p1, (add (decimate mask0, 2),
+                                             (decimate mask1, 2)))>;
+def OP_TRN2     : Op<(shuffle $p0, $p1, (interleave
+                                          (decimate (rotl mask0, 1), 2),
+                                          (decimate (rotl mask1, 1), 2)))>;
+def OP_ZIP2     : Op<(shuffle $p0, $p1, (highhalf (interleave mask0, mask1)))>;
+def OP_UZP2     : Op<(shuffle $p0, $p1, (add (decimate (rotl mask0, 1), 2),
+                                             (decimate (rotl mask1, 1), 2)))>;
+def OP_EQ       : Op<(cast "R", (op "==", $p0, $p1))>;
+def OP_GE       : Op<(cast "R", (op ">=", $p0, $p1))>;
+def OP_LE       : Op<(cast "R", (op "<=", $p0, $p1))>;
+def OP_GT       : Op<(cast "R", (op ">", $p0, $p1))>;
+def OP_LT       : Op<(cast "R", (op "<", $p0, $p1))>;
+def OP_NEG      : Op<(op "-", $p0)>;
+def OP_NOT      : Op<(op "~", $p0)>;
+def OP_AND      : Op<(op "&", $p0, $p1)>;
+def OP_OR       : Op<(op "|", $p0, $p1)>;
+def OP_XOR      : Op<(op "^", $p0, $p1)>;
+def OP_ANDN     : Op<(op "&", $p0, (op "~", $p1))>;
+def OP_ORN      : Op<(op "|", $p0, (op "~", $p1))>;
+def OP_CAST     : Op<(cast "R", $p0)>;
+def OP_HI       : Op<(shuffle $p0, $p0, (highhalf mask0))>;
+def OP_LO       : Op<(shuffle $p0, $p0, (lowhalf mask0))>;
+def OP_CONC     : Op<(shuffle $p0, $p1, (add mask0, mask1))>;
+def OP_DUP      : Op<(dup $p0)>;
+def OP_DUP_LN   : Op<(splat $p0, $p1)>;
+def OP_SEL      : Op<(cast "R", (op "|",
+                                    (op "&", $p0, (cast $p0, $p1)),
+                                    (op "&", (op "~", $p0), (cast $p0, $p2))))>;
+def OP_REV16    : Op<(shuffle $p0, $p0, (rev 16, mask0))>;
+def OP_REV32    : Op<(shuffle $p0, $p0, (rev 32, mask0))>;
+def OP_REV64    : Op<(shuffle $p0, $p0, (rev 64, mask0))>;
+def OP_XTN      : Op<(call "vcombine", $p0, (call "vmovn", $p1))>;
+def OP_SQXTUN   : Op<(call "vcombine", (cast $p0, "U", $p0),
+                                       (call "vqmovun", $p1))>;
+def OP_QXTN     : Op<(call "vcombine", $p0, (call "vqmovn", $p1))>;
+def OP_VCVT_NA_HI_F16 : Op<(call "vcombine", $p0, (call "vcvt_f16", $p1))>;
+def OP_VCVT_NA_HI_F32 : Op<(call "vcombine", $p0, (call "vcvt_f32_f64", $p1))>;
+def OP_VCVT_EX_HI_F32 : Op<(call "vcvt_f32_f16", (call "vget_high", $p0))>;
+def OP_VCVT_EX_HI_F64 : Op<(call "vcvt_f64_f32", (call "vget_high", $p0))>;
+def OP_VCVTX_HI : Op<(call "vcombine", $p0, (call "vcvtx_f32", $p1))>;
+def OP_REINT    : Op<(cast "R", $p0)>;
+def OP_ADDHNHi  : Op<(call "vcombine", $p0, (call "vaddhn", $p1, $p2))>;
+def OP_RADDHNHi : Op<(call "vcombine", $p0, (call "vraddhn", $p1, $p2))>;
+def OP_SUBHNHi  : Op<(call "vcombine", $p0, (call "vsubhn", $p1, $p2))>;
+def OP_RSUBHNHi : Op<(call "vcombine", $p0, (call "vrsubhn", $p1, $p2))>;
+def OP_ABDL     : Op<(cast "R", (call "vmovl", (cast $p0, "U",
+                                                     (call "vabd", $p0, $p1))))>;
+def OP_ABDLHi   : Op<(call "vabdl", (call "vget_high", $p0),
+                                    (call "vget_high", $p1))>;
+def OP_ABA      : Op<(op "+", $p0, (call "vabd", $p1, $p2))>;
+def OP_ABAL     : Op<(op "+", $p0, (call "vabdl", $p1, $p2))>;
+def OP_ABALHi   : Op<(call "vabal", $p0, (call "vget_high", $p1),
+                                       (call "vget_high", $p2))>;
+def OP_QDMULLHi : Op<(call "vqdmull", (call "vget_high", $p0),
+                                      (call "vget_high", $p1))>;
+def OP_QDMULLHi_N : Op<(call "vqdmull_n", (call "vget_high", $p0), $p1)>;
+def OP_QDMLALHi : Op<(call "vqdmlal", $p0, (call "vget_high", $p1),
+                                           (call "vget_high", $p2))>;
+def OP_QDMLALHi_N : Op<(call "vqdmlal_n", $p0, (call "vget_high", $p1), $p2)>;
+def OP_QDMLSLHi : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1),
+                                           (call "vget_high", $p2))>;
+def OP_QDMLSLHi_N : Op<(call "vqdmlsl_n", $p0, (call "vget_high", $p1), $p2)>;
+def OP_DIV  : Op<(op "/", $p0, $p1)>;
+def OP_LONG_HI : Op<(cast "R", (call (name_replace "_high_", "_"),
+                                                (call "vget_high", $p0), $p1))>;
+def OP_NARROW_HI : Op<(cast "R", (call "vcombine",
+                                       (cast "R", "H", $p0),
+                                       (cast "R", "H",
+                                           (call (name_replace "_high_", "_"),
+                                                 $p1, $p2))))>;
+def OP_MOVL_HI  : LOp<[(save_temp $a1, (call "vget_high", $p0)),
+                       (cast "R",
+                            (call "vshll_n", $a1, (literal "int32_t", "0")))]>;
+def OP_COPY_LN : Op<(call "vset_lane", (call "vget_lane", $p2, $p3), $p0, $p1)>;
+def OP_SCALAR_MUL_LN : Op<(op "*", $p0, (call "vget_lane", $p1, $p2))>;
+def OP_SCALAR_MULX_LN : Op<(call "vmulx", $p0, (call "vget_lane", $p1, $p2))>;
+def OP_SCALAR_VMULX_LN : LOp<[(save_temp $x, (call "vget_lane", $p0,
+                                                    (literal "int32_t", "0"))),
+                              (save_temp $y, (call "vget_lane", $p1, $p2)),
+                              (save_temp $z, (call "vmulx", $x, $y)),
+                              (call "vset_lane", $z, $p0, $p2)]>;
+def OP_SCALAR_VMULX_LNQ : LOp<[(save_temp $x, (call "vget_lane", $p0,
+                                                     (literal "int32_t", "0"))),
+                               (save_temp $y, (call "vget_lane", $p1, $p2)),
+                               (save_temp $z, (call "vmulx", $x, $y)),
+                               (call "vset_lane", $z, $p0, (literal "int32_t",
+                                                                     "0"))]>;
+class ScalarMulOp<string opname> :
+  Op<(call opname, $p0, (call "vget_lane", $p1, $p2))>;
+
+def OP_SCALAR_QDMULL_LN : ScalarMulOp<"vqdmull">;
+def OP_SCALAR_QDMULH_LN : ScalarMulOp<"vqdmulh">;
+def OP_SCALAR_QRDMULH_LN : ScalarMulOp<"vqrdmulh">;
+
+def OP_SCALAR_HALF_GET_LN : Op<(bitcast "float16_t",
+                                   (call "vget_lane",
+                                         (bitcast "int16x4_t", $p0), $p1))>;
+def OP_SCALAR_HALF_GET_LNQ : Op<(bitcast "float16_t",
+                                    (call "vget_lane",
+                                          (bitcast "int16x8_t", $p0), $p1))>;
+def OP_SCALAR_HALF_SET_LN : Op<(bitcast "float16x4_t",
+                                   (call "vset_lane",
+                                         (bitcast "int16_t", $p0),
+                                         (bitcast "int16x4_t", $p1), $p2))>;
+def OP_SCALAR_HALF_SET_LNQ : Op<(bitcast "float16x8_t",
+                                    (call "vset_lane",
+                                          (bitcast "int16_t", $p0),
+                                          (bitcast "int16x8_t", $p1), $p2))>;
 
-// size modifiers:
-// S: scalar, only used for function mangling.
-// U: unsigned
-// Q: 128b
-// H: 128b without mangling 'q'
-// P: polynomial
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
 
 ////////////////////////////////////////////////////////////////////////////////
 // E.3.1 Addition
@@ -538,7 +788,10 @@ def VUZP : WInst<"vuzp", "2dd", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">;
 // E.3.31 Vector reinterpret cast operations
 def VREINTERPRET
   : NoTestOpInst<"vreinterpret", "dd",
-         "csilUcUsUiUlhfPcPsQcQsQiQlQUcQUsQUiQUlQhQfQPcQPs", OP_REINT>;
+         "csilUcUsUiUlhfPcPsQcQsQiQlQUcQUsQUiQUlQhQfQPcQPs", OP_REINT> {
+  let CartesianProductOfTypes = 1;
+  let ArchGuard = "__ARM_ARCH < 8";
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Vector fused multiply-add operations
@@ -678,13 +931,13 @@ def QXTN2 : SOpInst<"vqmovn_high", "qhk", "silUsUiUl", OP_QXTN>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Converting vectors
-def VCVT_HIGH_F16 : SOpInst<"vcvt_high_f16", "qhj", "f", OP_VCVT_NA_HI>;
-def VCVT_HIGH_F32_F16 : SOpInst<"vcvt_high_f32", "wk", "h", OP_VCVT_EX_HI>;
-def VCVT_F32_F64 : SInst<"vcvt_f32_f64", "mj", "d">;
-def VCVT_HIGH_F32_F64 : SOpInst<"vcvt_high_f32", "qfj", "d", OP_VCVT_NA_HI>;
+def VCVT_HIGH_F16 : SOpInst<"vcvt_high_f16", "qhj", "f", OP_VCVT_NA_HI_F16>;
+def VCVT_HIGH_F32_F16 : SOpInst<"vcvt_high_f32", "wk", "h", OP_VCVT_EX_HI_F32>;
+def VCVT_F32_F64 : SInst<"vcvt_f32_f64", "md", "Qd">;
+def VCVT_HIGH_F32_F64 : SOpInst<"vcvt_high_f32", "qfj", "d", OP_VCVT_NA_HI_F32>;
 def VCVT_F64_F32 : SInst<"vcvt_f64_f32", "wd", "f">;
 def VCVT_F64 : SInst<"vcvt_f64", "Fd",  "lUlQlQUl">;
-def VCVT_HIGH_F64_F32 : SOpInst<"vcvt_high_f64", "wj", "f", OP_VCVT_EX_HI>;
+def VCVT_HIGH_F64_F32 : SOpInst<"vcvt_high_f64", "wj", "f", OP_VCVT_EX_HI_F64>;
 def VCVTX_F32_F64 : SInst<"vcvtx_f32", "fj",  "d">;
 def VCVTX_HIGH_F32_F64 : SOpInst<"vcvtx_high_f32", "qfj", "d", OP_VCVTX_HI>;
 def FRINTN : SInst<"vrndn", "dd", "fdQfQd">;
@@ -819,16 +1072,16 @@ def SET_LANE : IInst<"vset_lane", "dsdi", "dQdPlQPl">;
 def COPY_LANE : IOpInst<"vcopy_lane", "ddidi",
                         "csilUcUsUiUlPcPsPlfd", OP_COPY_LN>;
 def COPYQ_LANE : IOpInst<"vcopy_lane", "ddigi",
-                        "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPYQ_LN>;
+                        "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>;
 def COPY_LANEQ : IOpInst<"vcopy_laneq", "ddiki",
-                     "csilPcPsPlUcUsUiUlfd", OP_COPY_LNQ>;
+                     "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>;
 def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "ddidi",
                      "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Set all lanes to same value
 def VDUP_LANE1: WOpInst<"vdup_lane", "dgi", "hdQhQdPlQPl", OP_DUP_LN>;
-def VDUP_LANE2: WOpInst<"vdup_laneq", "dki",
+def VDUP_LANE2: WOpInst<"vdup_laneq", "dji",
                   "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl",
                         OP_DUP_LN>;
 def DUP_N   : WOpInst<"vdup_n", "ds", "dQdPlQPl", OP_DUP>;
@@ -999,14 +1252,12 @@ def VQTBX4_A64 : WInst<"vqtbx4", "ddDt", "UccPcQUcQcQPc">;
 // NeonEmitter implicitly takes the cartesian product of the type string with
 // itself during generation so, unlike all other intrinsics, this one should
 // include *all* types, not just additional ones.
-//
-// We also rely on NeonEmitter handling the 32-bit vreinterpret before the
-// 64-bit one so that the common casts don't get guarded as AArch64-only
-// (FIXME).
 def VVREINTERPRET
   : NoTestOpInst<"vreinterpret", "dd",
-       "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", OP_REINT>;
-
+       "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", OP_REINT> {
+  let CartesianProductOfTypes = 1;
+  let ArchGuard = "__ARM_ARCH >= 8 && defined(__aarch64__)";
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Intrinsics
@@ -1261,11 +1512,11 @@ def SCALAR_UQXTN : SInst<"vqmovn", "zs", "SUsSUiSUl">;
 
 // Scalar Floating Point  multiply (scalar, by element)
 def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "ssdi", "SfSd", OP_SCALAR_MUL_LN>;
-def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "ssji", "SfSd", OP_SCALAR_MUL_LNQ>;
+def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "ssji", "SfSd", OP_SCALAR_MUL_LN>;
 
 // Scalar Floating Point  multiply extended (scalar, by element)
 def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "ssdi", "SfSd", OP_SCALAR_MULX_LN>;
-def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "ssji", "SfSd", OP_SCALAR_MULX_LNQ>;
+def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "ssji", "SfSd", OP_SCALAR_MULX_LN>;
 
 def SCALAR_VMUL_N : IInst<"vmul_n", "dds", "d">;
 
@@ -1293,7 +1544,7 @@ def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "sssji", "SfSd", OP_FMS_LNQ>;
 
 // Signed Saturating Doubling Multiply Long (scalar by element)
 def SCALAR_SQDMULL_LANE : SOpInst<"vqdmull_lane", "rsdi", "SsSi", OP_SCALAR_QDMULL_LN>;
-def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "rsji", "SsSi", OP_SCALAR_QDMULL_LNQ>;
+def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "rsji", "SsSi", OP_SCALAR_QDMULL_LN>;
 
 // Signed Saturating Doubling Multiply-Add Long (scalar by element)
 def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "rrsdi", "SsSi">;
@@ -1305,15 +1556,18 @@ def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "rrsji", "SsSi">;
 
 // Scalar Integer Saturating Doubling Multiply Half High (scalar by element)
 def SCALAR_SQDMULH_LANE : SOpInst<"vqdmulh_lane", "ssdi", "SsSi", OP_SCALAR_QDMULH_LN>;
-def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ssji", "SsSi", OP_SCALAR_QDMULH_LNQ>;
+def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ssji", "SsSi", OP_SCALAR_QDMULH_LN>;
 
 // Scalar Integer Saturating Rounding Doubling Multiply Half High
 def SCALAR_SQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "ssdi", "SsSi", OP_SCALAR_QRDMULH_LN>;
-def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ssji", "SsSi", OP_SCALAR_QRDMULH_LNQ>;
+def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ssji", "SsSi", OP_SCALAR_QRDMULH_LN>;
 
 def SCALAR_VDUP_LANE : IInst<"vdup_lane", "sdi", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
 def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "sji", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
 
-def SCALAR_GET_LANE : IOpInst<"vget_lane", "sdi", "hQh", OP_SCALAR_GET_LN>;
-def SCALAR_SET_LANE : IOpInst<"vset_lane", "dsdi", "hQh", OP_SCALAR_SET_LN>;
+// FIXME: Rename so it is obvious this only applies to halves.
+def SCALAR_HALF_GET_LANE : IOpInst<"vget_lane", "sdi", "h", OP_SCALAR_HALF_GET_LN>;
+def SCALAR_HALF_SET_LANE : IOpInst<"vset_lane", "dsdi", "h", OP_SCALAR_HALF_SET_LN>;
+def SCALAR_HALF_GET_LANEQ : IOpInst<"vget_lane", "sdi", "Qh", OP_SCALAR_HALF_GET_LNQ>;
+def SCALAR_HALF_SET_LANEQ : IOpInst<"vset_lane", "dsdi", "Qh", OP_SCALAR_HALF_SET_LNQ>;
 }
diff --git a/test/CodeGen/arm64_vcvtfp.c b/test/CodeGen/arm64_vcvtfp.c
index 79c37adad4447209e247fc85b35e6c1359bb25bb..e3dca8159931e2dad167d42076f5b7ee7acc0996 100644
--- a/test/CodeGen/arm64_vcvtfp.c
+++ b/test/CodeGen/arm64_vcvtfp.c
@@ -44,5 +44,5 @@ float32x4_t test_vcvtx_high_f32_f64(float32x2_t x, float64x2_t v) {
   return vcvtx_high_f32_f64(x, v);
   // CHECK: llvm.aarch64.neon.fcvtxn.v2f32.v2f64
   // CHECK: shufflevector
-  // CHECK-NEXT: ret
+  // CHECK: ret
 }
diff --git a/test/Sema/arm-neon-types.c b/test/Sema/arm-neon-types.c
index a49de12d4485302e660a342828fa767849f3b630..a5ee708b50344260b6161e8197fbc5b4a205564b 100644
--- a/test/Sema/arm-neon-types.c
+++ b/test/Sema/arm-neon-types.c
@@ -17,7 +17,7 @@ float32x2_t test2(uint32x2_t x) {
 float32x2_t test3(uint32x2_t x) {
   // FIXME: The "incompatible result type" error is due to pr10112 and should be
   // removed when that is fixed.
-  return vcvt_n_f32_u32(x, 0); // expected-error {{argument should be a value from 1 to 32}} expected-error {{incompatible result type}}
+  return vcvt_n_f32_u32(x, 0); // expected-error {{argument should be a value from 1 to 32}}
 }
 
 typedef signed int vSInt32 __attribute__((__vector_size__(16)));
diff --git a/test/Sema/arm64-neon-args.c b/test/Sema/arm64-neon-args.c
index 9bd103a4dbb215c65f85df2879b40a0215773af3..315a7044040ee19eb530c5a8e35ab15f6148a76f 100644
--- a/test/Sema/arm64-neon-args.c
+++ b/test/Sema/arm64-neon-args.c
@@ -5,7 +5,7 @@
 
 // rdar://13527900
 void vcopy_reject(float32x4_t vOut0, float32x4_t vAlpha, int t) {
-  vcopyq_laneq_f32(vOut0, 1, vAlpha, t); // expected-error {{argument to '__builtin_neon_vgetq_lane_f32' must be a constant integer}} expected-error {{initializing 'float32_t' (aka 'float') with an expression of incompatible type 'void'}}
+  vcopyq_laneq_f32(vOut0, 1, vAlpha, t); // expected-error {{argument to '__builtin_neon_vgetq_lane_f32' must be a constant integer}}
 }
 
 // rdar://problem/15256199
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index 6dfdcb30ff24f460c72c20b4d907c7755cd4af9e..8dcb73cc686416dcf1983b6c379544e2118bf4a0 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -18,8 +18,9 @@
 // CodeGen library.
 //
 // Additional validation code can be generated by this file when runHeader() is
-// called, rather than the normal run() entry point.  A complete set of tests
-// for Neon intrinsics can be generated by calling the runTests() entry point.
+// called, rather than the normal run() entry point.
+//
+// See also the documentation in include/clang/Basic/arm_neon.td.
 //
 //===----------------------------------------------------------------------===//
 
@@ -31,318 +32,456 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
 #include "llvm/TableGen/TableGenBackend.h"
 #include <string>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <algorithm>
 using namespace llvm;
 
-enum OpKind {
-  OpNone,
-  OpUnavailable,
-  OpAdd,
-  OpAddl,
-  OpAddlHi,
-  OpAddw,
-  OpAddwHi,
-  OpSub,
-  OpSubl,
-  OpSublHi,
-  OpSubw,
-  OpSubwHi,
-  OpMul,
-  OpMla,
-  OpMlal,
-  OpMullHi,
-  OpMullHiP64,
-  OpMullHiN,
-  OpMlalHi,
-  OpMlalHiN,
-  OpMls,
-  OpMlsl,
-  OpMlslHi,
-  OpMlslHiN,
-  OpMulN,
-  OpMlaN,
-  OpMlsN,
-  OpFMlaN,
-  OpFMlsN,
-  OpMlalN,
-  OpMlslN,
-  OpMulLane,
-  OpMulXLane,
-  OpMullLane,
-  OpMullHiLane,
-  OpMlaLane,
-  OpMlsLane,
-  OpMlalLane,
-  OpMlalHiLane,
-  OpMlslLane,
-  OpMlslHiLane,
-  OpQDMullLane,
-  OpQDMullHiLane,
-  OpQDMlalLane,
-  OpQDMlalHiLane,
-  OpQDMlslLane,
-  OpQDMlslHiLane,
-  OpQDMulhLane,
-  OpQRDMulhLane,
-  OpFMSLane,
-  OpFMSLaneQ,
-  OpTrn1,
-  OpZip1,
-  OpUzp1,
-  OpTrn2,
-  OpZip2,
-  OpUzp2,
-  OpEq,
-  OpGe,
-  OpLe,
-  OpGt,
-  OpLt,
-  OpNeg,
-  OpNot,
-  OpAnd,
-  OpOr,
-  OpXor,
-  OpAndNot,
-  OpOrNot,
-  OpCast,
-  OpConcat,
-  OpDup,
-  OpDupLane,
-  OpHi,
-  OpLo,
-  OpSelect,
-  OpRev16,
-  OpRev32,
-  OpRev64,
-  OpXtnHi,
-  OpSqxtunHi,
-  OpQxtnHi,
-  OpFcvtnHi,
-  OpFcvtlHi,
-  OpFcvtxnHi,
-  OpReinterpret,
-  OpAddhnHi,
-  OpRAddhnHi,
-  OpSubhnHi,
-  OpRSubhnHi,
-  OpAbdl,
-  OpAbdlHi,
-  OpAba,
-  OpAbal,
-  OpAbalHi,
-  OpQDMullHi,
-  OpQDMullHiN,
-  OpQDMlalHi,
-  OpQDMlalHiN,
-  OpQDMlslHi,
-  OpQDMlslHiN,
-  OpDiv,
-  OpLongHi,
-  OpNarrowHi,
-  OpMovlHi,
-  OpCopyLane,
-  OpCopyQLane,
-  OpCopyLaneQ,
-  OpScalarMulLane,
-  OpScalarMulLaneQ,
-  OpScalarMulXLane,
-  OpScalarMulXLaneQ,
-  OpScalarVMulXLane,
-  OpScalarVMulXLaneQ,
-  OpScalarQDMullLane,
-  OpScalarQDMullLaneQ,
-  OpScalarQDMulHiLane,
-  OpScalarQDMulHiLaneQ,
-  OpScalarQRDMulHiLane,
-  OpScalarQRDMulHiLaneQ,
-  OpScalarGetLane,
-  OpScalarSetLane
-};
+namespace {
+
+// While globals are generally bad, this one allows us to perform assertions
+// liberally and somehow still trace them back to the def they indirectly
+// came from.
+static Record *CurrentRecord = nullptr;
+static void assert_with_loc(bool Assertion, const std::string &Str) {
+  if (!Assertion) {
+    if (CurrentRecord)
+      PrintFatalError(CurrentRecord->getLoc(), Str);
+    else
+      PrintFatalError(Str);
+  }
+}
 
 enum ClassKind {
   ClassNone,
-  ClassI,           // generic integer instruction, e.g., "i8" suffix
-  ClassS,           // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
-  ClassW,           // width-specific instruction, e.g., "8" suffix
-  ClassB,           // bitcast arguments with enum argument to specify type
-  ClassL,           // Logical instructions which are op instructions
-                    // but we need to not emit any suffix for in our
-                    // tests.
-  ClassNoTest       // Instructions which we do not test since they are
-                    // not TRUE instructions.
+  ClassI,     // generic integer instruction, e.g., "i8" suffix
+  ClassS,     // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
+  ClassW,     // width-specific instruction, e.g., "8" suffix
+  ClassB,     // bitcast arguments with enum argument to specify type
+  ClassL,     // Logical instructions which are op instructions
+              // but we need to not emit any suffix for in our
+              // tests.
+  ClassNoTest // Instructions which we do not test since they are
+              // not TRUE instructions.
 };
 
 /// NeonTypeFlags - Flags to identify the types for overloaded Neon
 /// builtins.  These must be kept in sync with the flags in
 /// include/clang/Basic/TargetBuiltins.h.
-namespace {
-class NeonTypeFlags {
-  enum {
-    EltTypeMask = 0xf,
-    UnsignedFlag = 0x10,
-    QuadFlag = 0x20
-  };
-  uint32_t Flags;
+namespace NeonTypeFlags {
+enum { EltTypeMask = 0xf, UnsignedFlag = 0x10, QuadFlag = 0x20 };
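+// A type's flag value is its element type from the EltType enum below,
+// optionally combined with UnsignedFlag and/or QuadFlag; for example,
+// uint16x8_t would be encoded as (Int16 | UnsignedFlag | QuadFlag).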
+
+enum EltType {
+  Int8,
+  Int16,
+  Int32,
+  Int64,
+  Poly8,
+  Poly16,
+  Poly64,
+  Poly128,
+  Float16,
+  Float32,
+  Float64
+};
+}
+
+class Intrinsic;
+class NeonEmitter;
+class Type;
+class Variable;
 
+//===----------------------------------------------------------------------===//
+// TypeSpec
+//===----------------------------------------------------------------------===//
+
+/// A TypeSpec is just a simple wrapper around a string, but gets its own type
+/// for strong typing purposes.
+///
+/// A TypeSpec can be used to create a type.
+class TypeSpec : public std::string {
 public:
-  enum EltType {
-    Int8,
-    Int16,
-    Int32,
-    Int64,
-    Poly8,
-    Poly16,
-    Poly64,
-    Poly128,
-    Float16,
-    Float32,
-    Float64
-  };
+  static std::vector<TypeSpec> fromTypeSpecs(StringRef Str) {
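+    // Split a string of typespecs into individual specs: uppercase modifiers
+    // accumulate until a lowercase base character closes each spec, so e.g.
+    // "csQUi" would (illustratively) be split into {"c", "s", "QUi"}.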
+    std::vector<TypeSpec> Ret;
+    TypeSpec Acc;
+    for (char I : Str.str()) {
+      if (islower(I)) {
+        Acc.push_back(I);
+        Ret.push_back(TypeSpec(Acc));
+        Acc.clear();
+      } else {
+        Acc.push_back(I);
+      }
+    }
+    return Ret;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Type
+//===----------------------------------------------------------------------===//
+
+/// A Type. Not much more to say here.
+class Type {
+private:
+  TypeSpec TS;
+
+  bool Float, Signed, Void, Poly, Constant, Pointer;
+  // ScalarForMangling and NoManglingQ are really not suited to live here as
+  // they are not related to the type. But they live in the TypeSpec (not the
+  // prototype), so this is really the only place to store them.
+  bool ScalarForMangling, NoManglingQ;
+  unsigned Bitwidth, ElementBitwidth, NumVectors;
+
+public:
+  Type()
+      : Float(false), Signed(false), Void(true), Poly(false), Constant(false),
+        Pointer(false), ScalarForMangling(false), NoManglingQ(false),
+        Bitwidth(0), ElementBitwidth(0), NumVectors(0) {}
+
+  Type(TypeSpec TS, char CharMod)
+      : TS(TS), Float(false), Signed(false), Void(false), Poly(false),
+        Constant(false), Pointer(false), ScalarForMangling(false),
+        NoManglingQ(false), Bitwidth(0), ElementBitwidth(0), NumVectors(0) {
+    applyModifier(CharMod);
+  }
+
+  /// Returns a type representing "void".
+  static Type getVoid() { return Type(); }
 
-  NeonTypeFlags(unsigned F) : Flags(F) {}
-  NeonTypeFlags(EltType ET, bool IsUnsigned, bool IsQuad) : Flags(ET) {
-    if (IsUnsigned)
-      Flags |= UnsignedFlag;
-    if (IsQuad)
-      Flags |= QuadFlag;
+  bool operator==(const Type &Other) const { return str() == Other.str(); }
+  bool operator!=(const Type &Other) const { return !operator==(Other); }
+
+  //
+  // Query functions
+  //
+  bool isScalarForMangling() const { return ScalarForMangling; }
+  bool noManglingQ() const { return NoManglingQ; }
+
+  bool isPointer() const { return Pointer; }
+  bool isFloating() const { return Float; }
+  bool isInteger() const { return !Float && !Poly; }
+  bool isSigned() const { return Signed; }
+  bool isScalar() const { return NumVectors == 0; }
+  bool isVector() const { return NumVectors > 0; }
+  bool isFloat() const { return Float && ElementBitwidth == 32; }
+  bool isDouble() const { return Float && ElementBitwidth == 64; }
+  bool isHalf() const { return Float && ElementBitwidth == 16; }
+  bool isPoly() const { return Poly; }
+  bool isChar() const { return ElementBitwidth == 8; }
+  bool isShort() const { return !Float && ElementBitwidth == 16; }
+  bool isInt() const { return !Float && ElementBitwidth == 32; }
+  bool isLong() const { return !Float && ElementBitwidth == 64; }
+  bool isVoid() const { return Void; }
+  unsigned getNumElements() const { return Bitwidth / ElementBitwidth; }
+  unsigned getSizeInBits() const { return Bitwidth; }
+  unsigned getElementSizeInBits() const { return ElementBitwidth; }
+  unsigned getNumVectors() const { return NumVectors; }
+
+  //
+  // Mutator functions
+  //
+  void makeUnsigned() { Signed = false; }
+  void makeSigned() { Signed = true; }
+  void makeInteger(unsigned ElemWidth, bool Sign) {
+    Float = false;
+    Poly = false;
+    Signed = Sign;
+    ElementBitwidth = ElemWidth;
+  }
+  void makeScalar() {
+    Bitwidth = ElementBitwidth;
+    NumVectors = 0;
+  }
+  void makeOneVector() {
+    assert(isVector());
+    NumVectors = 1;
   }
+  void doubleLanes() {
+    assert_with_loc(Bitwidth != 128, "Can't get bigger than 128!");
+    Bitwidth = 128;
+  }
+  void halveLanes() {
+    assert_with_loc(Bitwidth != 64, "Can't get smaller than 64!");
+    Bitwidth = 64;
+  }
+
+  /// Return the C string representation of a type, which is the typename
+  /// defined in stdint.h or arm_neon.h.
+  std::string str() const;
+
+  /// Return the string representation of a type, which is an encoded
+  /// string for passing to the BUILTIN() macro in Builtins.def.
+  std::string builtin_str() const;
 
-  uint32_t getFlags() const { return Flags; }
+  /// Return the value in NeonTypeFlags for this type.
+  unsigned getNeonEnum() const;
+
+  /// Parse a type from a stdint.h or arm_neon.h typedef name,
+  /// for example uint32x2_t or int64_t.
+  static Type fromTypedefName(StringRef Name);
+
+private:
+  /// Creates the type based on the typespec string in TS.
+  /// Sets "Quad" to true if the "Q" or "H" modifiers were
+  /// seen. This is needed by applyModifier as some modifiers
+  /// only take effect if the type size was changed by "Q" or "H".
+  void applyTypespec(bool &Quad);
+  /// Applies a prototype modifier to the type.
+  void applyModifier(char Mod);
 };
-} // end anonymous namespace
 
-namespace {
+//===----------------------------------------------------------------------===//
+// Variable
+//===----------------------------------------------------------------------===//
+
+/// A variable is a simple class that just has a type and a name.
+class Variable {
+  Type T;
+  std::string N;
+
+public:
+  Variable() : T(Type::getVoid()), N("") {}
+  Variable(Type T, std::string N) : T(T), N(N) {}
+
+  Type getType() const { return T; }
+  std::string getName() const { return "__" + N; }
+};
+
+//===----------------------------------------------------------------------===//
+// Intrinsic
+//===----------------------------------------------------------------------===//
+
+/// The main grunt class. This represents an instantiation of an intrinsic with
+/// a particular typespec and prototype.
+class Intrinsic {
+  /// The Record this intrinsic was created from.
+  Record *R;
+  /// The unmangled name and prototype.
+  std::string Name, Proto;
+  /// The input and output typespecs. InTS == OutTS except when
+  /// CartesianProductOfTypes is 1 - this is the case for vreinterpret.
+  TypeSpec OutTS, InTS;
+  /// The base class kind. Most intrinsics use ClassS, which has full type
+  /// info for integers (s32/u32). Some use ClassI, which doesn't care about
+  /// signedness (i32), while some (ClassB) have no type at all, only a width
+  /// (32).
+  ClassKind CK;
+  /// The list of DAGs for the body. May be empty, in which case we should
+  /// emit a builtin call.
+  ListInit *Body;
+  /// The architectural #ifdef guard.
+  std::string Guard;
+  /// Set if the Unavailable bit is 1. This means we don't generate a body,
+  /// just an "unavailable" attribute on a declaration.
+  bool IsUnavailable;
+
+  /// The types of return value [0] and parameters [1..].
+  std::vector<Type> Types;
+  /// The local variables defined.
+  std::map<std::string, Variable> Variables;
+  /// NeededEarly - set if any other intrinsic depends on this intrinsic.
+  bool NeededEarly;
+  /// UseMacro - set if we should implement using a macro or unset for a
+  ///            function.
+  bool UseMacro;
+  /// The set of intrinsics that this intrinsic uses/requires.
+  std::set<Intrinsic *> Dependencies;
+  /// The "base type", which is Type('d', OutTS). InBaseType is only
+  /// different if CartesianProductOfTypes = 1 (for vreinterpret).
+  Type BaseType, InBaseType;
+  /// The return variable.
+  Variable RetVar;
+  /// A postfix to apply to every variable. Defaults to "".
+  std::string VariablePostfix;
+
+  NeonEmitter &Emitter;
+  std::stringstream OS;
+
+public:
+  Intrinsic(Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS,
+            TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter,
+            StringRef Guard, bool IsUnavailable)
+      : R(R), Name(Name.str()), Proto(Proto.str()), OutTS(OutTS), InTS(InTS),
+        CK(CK), Body(Body), Guard(Guard.str()), IsUnavailable(IsUnavailable),
+        NeededEarly(false), UseMacro(false), BaseType(OutTS, 'd'),
+        InBaseType(InTS, 'd'), Emitter(Emitter) {
+    // If this builtin takes an immediate argument, we need to #define it rather
+    // than use a standard declaration, so that SemaChecking can range check
+    // the immediate passed by the user.
+    if (Proto.find('i') != std::string::npos)
+      UseMacro = true;
+
+    // Pointer arguments need to use macros to avoid hiding aligned attributes
+    // from the pointer type.
+    if (Proto.find('p') != std::string::npos ||
+        Proto.find('c') != std::string::npos)
+      UseMacro = true;
+
+    // It is not permitted to pass or return an __fp16 by value, so intrinsics
+    // taking a scalar float16_t must be implemented as macros.
+    if (OutTS.find('h') != std::string::npos &&
+        Proto.find('s') != std::string::npos)
+      UseMacro = true;
+
+    // Modify the TypeSpec per-argument to get a concrete Type, and create
+    // known variables for each.
+    // Types[0] is the return value.
+    Types.push_back(Type(OutTS, Proto[0]));
+    for (unsigned I = 1; I < Proto.size(); ++I)
+      Types.push_back(Type(InTS, Proto[I]));
+  }
+
+  /// Get the Record that this intrinsic is based off.
+  Record *getRecord() const { return R; }
+  /// Get the set of Intrinsics that this intrinsic calls.
+  /// This is the set of immediate dependencies, NOT the
+  /// transitive closure.
+  const std::set<Intrinsic *> &getDependencies() const { return Dependencies; }
+  /// Get the architectural guard string (#ifdef).
+  std::string getGuard() const { return Guard; }
+  /// Get the non-mangled name.
+  std::string getName() const { return Name; }
+
+  /// Return true if the intrinsic takes an immediate operand.
+  bool hasImmediate() const {
+    return Proto.find('i') != std::string::npos;
+  }
+  /// Return the parameter index of the immediate operand.
+  unsigned getImmediateIdx() const {
+    assert(hasImmediate());
+    unsigned Idx = Proto.find('i');
+    assert(Idx > 0 && "Can't return an immediate!");
+    return Idx - 1;
+  }
+
+  /// Return true if the intrinsic takes a splat operand.
+  bool hasSplat() const { return Proto.find('a') != std::string::npos; }
+  /// Return the parameter index of the splat operand.
+  unsigned getSplatIdx() const {
+    assert(hasSplat());
+    unsigned Idx = Proto.find('a');
+    assert(Idx > 0 && "Can't return a splat!");
+    return Idx - 1;
+  }
+
+  unsigned getNumParams() const { return Proto.size() - 1; }
+  Type getReturnType() const { return Types[0]; }
+  Type getParamType(unsigned I) const { return Types[I + 1]; }
+  Type getBaseType() const { return BaseType; }
+  /// Return the raw prototype string.
+  std::string getProto() const { return Proto; }
+
+  /// Return true if the prototype has a scalar argument.
+  /// This does not return true for the "splat" code ('a').
+  bool protoHasScalar();
+
+  /// Return the index that parameter PIndex will sit at
+  /// in a generated function call. This is often just PIndex,
+  /// but may not be as things such as multiple-vector operands
+  /// and sret parameters need to be taken into account.
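+  /// For example (assuming the return type is a single vector), if
+  /// parameter 0 expands to two vectors then parameter 1 sits at
+  /// generated index 2 rather than 1.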
+  unsigned getGeneratedParamIdx(unsigned PIndex) {
+    unsigned Idx = 0;
+    if (getReturnType().getNumVectors() > 1)
+      // Multiple vectors are passed as sret.
+      ++Idx;
+
+    for (unsigned I = 0; I < PIndex; ++I)
+      Idx += std::max(1U, getParamType(I).getNumVectors());
+
+    return Idx;
+  }
+
+  bool hasBody() const { return Body && Body->getValues().size() > 0; }
+
+  void setNeededEarly() { NeededEarly = true; }
+
+  bool operator<(const Intrinsic &Other) const {
+    // Sort lexicographically on a two-tuple (Guard, Name)
+    if (Guard != Other.Guard)
+      return Guard < Other.Guard;
+    return Name < Other.Name;
+  }
+
+  ClassKind getClassKind(bool UseClassBIfScalar = false) {
+    if (UseClassBIfScalar && !protoHasScalar())
+      return ClassB;
+    return CK;
+  }
+
+  /// Return the name, mangled with type information.
+  /// If ForceClassS is true, use ClassS (u32/s32) instead
+  /// of the intrinsic's own type class.
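+  /// For example, "vget_lane" instantiated for int16x4_t mangles to
+  /// "vget_lane_s16" (see the documentation in arm_neon.td).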
+  std::string getMangledName(bool ForceClassS = false);
+  /// Return the type code for a builtin function call.
+  std::string getInstTypeCode(Type T, ClassKind CK);
+  /// Return the type string for a BUILTIN() macro in Builtins.def.
+  std::string getBuiltinTypeStr();
+
+  /// Generate the intrinsic, returning code.
+  std::string generate();
+  /// Perform type checking and populate the dependency graph, but
+  /// don't generate code yet.
+  void indexBody();
+
+private:
+  std::string mangleName(std::string Name, ClassKind CK);
+
+  void initVariables();
+  std::string replaceParamsIn(std::string S);
+
+  void emitBodyAsBuiltinCall();
+  std::pair<Type, std::string> emitDagArg(Init *Arg, std::string ArgName);
+  std::pair<Type, std::string> emitDagSaveTemp(DagInit *DI);
+  std::pair<Type, std::string> emitDagSplat(DagInit *DI);
+  std::pair<Type, std::string> emitDagDup(DagInit *DI);
+  std::pair<Type, std::string> emitDagShuffle(DagInit *DI);
+  std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast);
+  std::pair<Type, std::string> emitDagCall(DagInit *DI);
+  std::pair<Type, std::string> emitDagNameReplace(DagInit *DI);
+  std::pair<Type, std::string> emitDagLiteral(DagInit *DI);
+  std::pair<Type, std::string> emitDagOp(DagInit *DI);
+  std::pair<Type, std::string> emitDag(DagInit *DI);
+
+  void emitReturn();
+  void emitBody();
+  void emitShadowedArgs();
+  void emitNewLine();
+  void emitClosingBrace();
+  void emitOpeningBrace();
+  void emitPrototype();
+};
+
+//===----------------------------------------------------------------------===//
+// NeonEmitter
+//===----------------------------------------------------------------------===//
+
 class NeonEmitter {
   RecordKeeper &Records;
-  StringMap<OpKind> OpMap;
-  DenseMap<Record*, ClassKind> ClassMap;
+  DenseMap<Record *, ClassKind> ClassMap;
+  std::map<std::string, std::vector<Intrinsic *>> IntrinsicMap;
+  unsigned UniqueNumber;
+
+  void createIntrinsic(Record *R, SmallVectorImpl<Intrinsic *> &Out);
+  void genBuiltinsDef(raw_ostream &OS, SmallVectorImpl<Intrinsic *> &Defs);
+  void genOverloadTypeCheckCode(raw_ostream &OS,
+                                SmallVectorImpl<Intrinsic *> &Defs);
+  void genIntrinsicRangeCheckCode(raw_ostream &OS,
+                                  SmallVectorImpl<Intrinsic *> &Defs);
 
 public:
-  NeonEmitter(RecordKeeper &R) : Records(R) {
-    OpMap["OP_NONE"]  = OpNone;
-    OpMap["OP_UNAVAILABLE"] = OpUnavailable;
-    OpMap["OP_ADD"]   = OpAdd;
-    OpMap["OP_ADDL"]  = OpAddl;
-    OpMap["OP_ADDLHi"] = OpAddlHi;
-    OpMap["OP_ADDW"]  = OpAddw;
-    OpMap["OP_ADDWHi"] = OpAddwHi;
-    OpMap["OP_SUB"]   = OpSub;
-    OpMap["OP_SUBL"]  = OpSubl;
-    OpMap["OP_SUBLHi"] = OpSublHi;
-    OpMap["OP_SUBW"]  = OpSubw;
-    OpMap["OP_SUBWHi"] = OpSubwHi;
-    OpMap["OP_MUL"]   = OpMul;
-    OpMap["OP_MLA"]   = OpMla;
-    OpMap["OP_MLAL"]  = OpMlal;
-    OpMap["OP_MULLHi"]  = OpMullHi;
-    OpMap["OP_MULLHi_P64"]  = OpMullHiP64;
-    OpMap["OP_MULLHi_N"]  = OpMullHiN;
-    OpMap["OP_MLALHi"]  = OpMlalHi;
-    OpMap["OP_MLALHi_N"]  = OpMlalHiN;
-    OpMap["OP_MLS"]   = OpMls;
-    OpMap["OP_MLSL"]  = OpMlsl;
-    OpMap["OP_MLSLHi"] = OpMlslHi;
-    OpMap["OP_MLSLHi_N"] = OpMlslHiN;
-    OpMap["OP_MUL_N"] = OpMulN;
-    OpMap["OP_MLA_N"] = OpMlaN;
-    OpMap["OP_MLS_N"] = OpMlsN;
-    OpMap["OP_FMLA_N"] = OpFMlaN;
-    OpMap["OP_FMLS_N"] = OpFMlsN;
-    OpMap["OP_MLAL_N"] = OpMlalN;
-    OpMap["OP_MLSL_N"] = OpMlslN;
-    OpMap["OP_MUL_LN"]= OpMulLane;
-    OpMap["OP_MULX_LN"]= OpMulXLane;
-    OpMap["OP_MULL_LN"] = OpMullLane;
-    OpMap["OP_MULLHi_LN"] = OpMullHiLane;
-    OpMap["OP_MLA_LN"]= OpMlaLane;
-    OpMap["OP_MLS_LN"]= OpMlsLane;
-    OpMap["OP_MLAL_LN"] = OpMlalLane;
-    OpMap["OP_MLALHi_LN"] = OpMlalHiLane;
-    OpMap["OP_MLSL_LN"] = OpMlslLane;
-    OpMap["OP_MLSLHi_LN"] = OpMlslHiLane;
-    OpMap["OP_QDMULL_LN"] = OpQDMullLane;
-    OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane;
-    OpMap["OP_QDMLAL_LN"] = OpQDMlalLane;
-    OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane;
-    OpMap["OP_QDMLSL_LN"] = OpQDMlslLane;
-    OpMap["OP_QDMLSLHi_LN"] = OpQDMlslHiLane;
-    OpMap["OP_QDMULH_LN"] = OpQDMulhLane;
-    OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane;
-    OpMap["OP_FMS_LN"] = OpFMSLane;
-    OpMap["OP_FMS_LNQ"] = OpFMSLaneQ;
-    OpMap["OP_TRN1"]  = OpTrn1;
-    OpMap["OP_ZIP1"]  = OpZip1;
-    OpMap["OP_UZP1"]  = OpUzp1;
-    OpMap["OP_TRN2"]  = OpTrn2;
-    OpMap["OP_ZIP2"]  = OpZip2;
-    OpMap["OP_UZP2"]  = OpUzp2;
-    OpMap["OP_EQ"]    = OpEq;
-    OpMap["OP_GE"]    = OpGe;
-    OpMap["OP_LE"]    = OpLe;
-    OpMap["OP_GT"]    = OpGt;
-    OpMap["OP_LT"]    = OpLt;
-    OpMap["OP_NEG"]   = OpNeg;
-    OpMap["OP_NOT"]   = OpNot;
-    OpMap["OP_AND"]   = OpAnd;
-    OpMap["OP_OR"]    = OpOr;
-    OpMap["OP_XOR"]   = OpXor;
-    OpMap["OP_ANDN"]  = OpAndNot;
-    OpMap["OP_ORN"]   = OpOrNot;
-    OpMap["OP_CAST"]  = OpCast;
-    OpMap["OP_CONC"]  = OpConcat;
-    OpMap["OP_HI"]    = OpHi;
-    OpMap["OP_LO"]    = OpLo;
-    OpMap["OP_DUP"]   = OpDup;
-    OpMap["OP_DUP_LN"] = OpDupLane;
-    OpMap["OP_SEL"]   = OpSelect;
-    OpMap["OP_REV16"] = OpRev16;
-    OpMap["OP_REV32"] = OpRev32;
-    OpMap["OP_REV64"] = OpRev64;
-    OpMap["OP_XTN"] = OpXtnHi;
-    OpMap["OP_SQXTUN"] = OpSqxtunHi;
-    OpMap["OP_QXTN"] = OpQxtnHi;
-    OpMap["OP_VCVT_NA_HI"] = OpFcvtnHi;
-    OpMap["OP_VCVT_EX_HI"] = OpFcvtlHi;
-    OpMap["OP_VCVTX_HI"] = OpFcvtxnHi;
-    OpMap["OP_REINT"] = OpReinterpret;
-    OpMap["OP_ADDHNHi"] = OpAddhnHi;
-    OpMap["OP_RADDHNHi"] = OpRAddhnHi;
-    OpMap["OP_SUBHNHi"] = OpSubhnHi;
-    OpMap["OP_RSUBHNHi"] = OpRSubhnHi;
-    OpMap["OP_ABDL"]  = OpAbdl;
-    OpMap["OP_ABDLHi"] = OpAbdlHi;
-    OpMap["OP_ABA"]   = OpAba;
-    OpMap["OP_ABAL"]  = OpAbal;
-    OpMap["OP_ABALHi"] = OpAbalHi;
-    OpMap["OP_QDMULLHi"] = OpQDMullHi;
-    OpMap["OP_QDMULLHi_N"] = OpQDMullHiN;
-    OpMap["OP_QDMLALHi"] = OpQDMlalHi;
-    OpMap["OP_QDMLALHi_N"] = OpQDMlalHiN;
-    OpMap["OP_QDMLSLHi"] = OpQDMlslHi;
-    OpMap["OP_QDMLSLHi_N"] = OpQDMlslHiN;
-    OpMap["OP_DIV"] = OpDiv;
-    OpMap["OP_LONG_HI"] = OpLongHi;
-    OpMap["OP_NARROW_HI"] = OpNarrowHi;
-    OpMap["OP_MOVL_HI"] = OpMovlHi;
-    OpMap["OP_COPY_LN"] = OpCopyLane;
-    OpMap["OP_COPYQ_LN"] = OpCopyQLane;
-    OpMap["OP_COPY_LNQ"] = OpCopyLaneQ;
-    OpMap["OP_SCALAR_MUL_LN"]= OpScalarMulLane;
-    OpMap["OP_SCALAR_MUL_LNQ"]= OpScalarMulLaneQ;
-    OpMap["OP_SCALAR_MULX_LN"]= OpScalarMulXLane;
-    OpMap["OP_SCALAR_MULX_LNQ"]= OpScalarMulXLaneQ;
-    OpMap["OP_SCALAR_VMULX_LN"]= OpScalarVMulXLane;
-    OpMap["OP_SCALAR_VMULX_LNQ"]= OpScalarVMulXLaneQ;
-    OpMap["OP_SCALAR_QDMULL_LN"] = OpScalarQDMullLane;
-    OpMap["OP_SCALAR_QDMULL_LNQ"] = OpScalarQDMullLaneQ;
-    OpMap["OP_SCALAR_QDMULH_LN"] = OpScalarQDMulHiLane;
-    OpMap["OP_SCALAR_QDMULH_LNQ"] = OpScalarQDMulHiLaneQ;
-    OpMap["OP_SCALAR_QRDMULH_LN"] = OpScalarQRDMulHiLane;
-    OpMap["OP_SCALAR_QRDMULH_LNQ"] = OpScalarQRDMulHiLaneQ;
-    OpMap["OP_SCALAR_GET_LN"] = OpScalarGetLane;
-    OpMap["OP_SCALAR_SET_LN"] = OpScalarSetLane;
+  /// Called by Intrinsic - this attempts to get an intrinsic that takes
+  /// the given types as arguments.
+  Intrinsic *getIntrinsic(StringRef Name, ArrayRef<Type> Types);
 
+  /// Called by Intrinsic - returns a globally-unique number.
+  unsigned getUniqueNumber() { return UniqueNumber++; }
+
+  NeonEmitter(RecordKeeper &R) : Records(R), UniqueNumber(0) {
     Record *SI = R.getClass("SInst");
     Record *II = R.getClass("IInst");
     Record *WI = R.getClass("WInst");
@@ -370,2856 +509,1545 @@ public:
 
   // runTests - Emit tests for all the Neon intrinsics.
   void runTests(raw_ostream &o);
-
-private:
-  void emitGuardedIntrinsic(raw_ostream &OS, Record *R,
-                            std::string &CurrentGuard, bool &InGuard,
-                            StringMap<ClassKind> &EmittedMap);
-  void emitIntrinsic(raw_ostream &OS, Record *R,
-                     StringMap<ClassKind> &EmittedMap);
-  void genBuiltinsDef(raw_ostream &OS);
-  void genOverloadTypeCheckCode(raw_ostream &OS);
-  void genIntrinsicRangeCheckCode(raw_ostream &OS);
-  void genTargetTest(raw_ostream &OS);
 };
+
 } // end anonymous namespace
 
-/// ParseTypes - break down a string such as "fQf" into a vector of StringRefs,
-/// which each StringRef representing a single type declared in the string.
-/// for "fQf" we would end up with 2 StringRefs, "f", and "Qf", representing
-/// 2xfloat and 4xfloat respectively.
-static void ParseTypes(Record *r, std::string &s,
-                       SmallVectorImpl<StringRef> &TV) {
-  const char *data = s.data();
-  int len = 0;
-
-  for (unsigned i = 0, e = s.size(); i != e; ++i, ++len) {
-    if (data[len] == 'P' || data[len] == 'Q' || data[len] == 'U'
-                         || data[len] == 'H' || data[len] == 'S')
-      continue;
+//===----------------------------------------------------------------------===//
+// Type implementation
+//===----------------------------------------------------------------------===//
 
-    switch (data[len]) {
-      case 'c':
-      case 's':
-      case 'i':
-      case 'l':
-      case 'k':
-      case 'h':
-      case 'f':
-      case 'd':
-        break;
-      default:
-        PrintFatalError(r->getLoc(),
-                      "Unexpected letter: " + std::string(data + len, 1));
-    }
-    TV.push_back(StringRef(data, len + 1));
-    data += len + 1;
-    len = -1;
-  }
+std::string Type::str() const {
+  if (Void)
+    return "void";
+  std::string S;
+
+  if (!Signed && isInteger())
+    S += "u";
+
+  if (Poly)
+    S += "poly";
+  else if (Float)
+    S += "float";
+  else
+    S += "int";
+
+  S += utostr(ElementBitwidth);
+  if (isVector())
+    S += "x" + utostr(getNumElements());
+  if (NumVectors > 1)
+    S += "x" + utostr(NumVectors);
+  S += "_t";
+
+  if (Constant)
+    S += " const";
+  if (Pointer)
+    S += " *";
+
+  return S;
 }
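+// Illustrative examples of the strings produced by Type::str() above: a
+// 4-lane vector of signed 32-bit integers prints as "int32x4_t", and a pair
+// of such vectors (NumVectors == 2) prints as "int32x4x2_t".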
 
-/// Widen - Convert a type code into the next wider type.  char -> short,
-/// short -> int, etc.
-static char Widen(const char t) {
-  switch (t) {
-    case 'c':
-      return 's';
-    case 's':
-      return 'i';
-    case 'i':
-      return 'l';
-    case 'l':
-      return 'k';
-    case 'h':
-      return 'f';
-    case 'f':
-      return 'd';
-    default:
-      PrintFatalError("unhandled type in widen!");
+std::string Type::builtin_str() const {
+  std::string S;
+  if (isVoid())
+    return "v";
+
+  if (Pointer)
+    // All pointers are void pointers.
+    S += "v";
+  else if (isInteger())
+    switch (ElementBitwidth) {
+    case 8: S += "c"; break;
+    case 16: S += "s"; break;
+    case 32: S += "i"; break;
+    case 64: S += "Wi"; break;
+    case 128: S += "LLLi"; break;
+    default: assert(0 && "Unhandled case!");
+    }
+  else
+    switch (ElementBitwidth) {
+    case 16: S += "h"; break;
+    case 32: S += "f"; break;
+    case 64: S += "d"; break;
+    default: assert(0 && "Unhandled case!");
+    }
+
+  if (isChar() && !Pointer)
+    // Make chars explicitly signed.
+    S = "S" + S;
+  else if (isInteger() && !Pointer && !Signed)
+    S = "U" + S;
+
+  if (isScalar()) {
+    if (Constant) S += "C";
+    if (Pointer) S += "*";
+    return S;
   }
+
+  std::string Ret;
+  for (unsigned I = 0; I < NumVectors; ++I)
+    Ret += "V" + utostr(getNumElements()) + S;
+
+  return Ret;
 }
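+// Illustrative examples of Type::builtin_str() above: a float32x4_t vector
+// yields "V4f", and a uint32x4_t vector yields "V4Ui".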
 
-/// Narrow - Convert a type code into the next smaller type.  short -> char,
-/// float -> half float, etc.
-static char Narrow(const char t) {
-  switch (t) {
-    case 's':
-      return 'c';
-    case 'i':
-      return 's';
-    case 'l':
-      return 'i';
-    case 'k':
-      return 'l';
-    case 'f':
-      return 'h';
-    case 'd':
-      return 'f';
-    default:
-      PrintFatalError("unhandled type in narrow!");
+unsigned Type::getNeonEnum() const {
+  unsigned Addend;
+  switch (ElementBitwidth) {
+  case 8: Addend = 0; break;
+  case 16: Addend = 1; break;
+  case 32: Addend = 2; break;
+  case 64: Addend = 3; break;
+  case 128: Addend = 4; break;
+  default: assert(0 && "Unhandled element bitwidth!");
   }
-}
 
-static std::string GetNarrowTypestr(StringRef ty)
-{
-  std::string s;
-  for (size_t i = 0, end = ty.size(); i < end; i++) {
-    switch (ty[i]) {
-      case 's':
-        s += 'c';
-        break;
-      case 'i':
-        s += 's';
-        break;
-      case 'l':
-        s += 'i';
-        break;
-      case 'k':
-        s += 'l';
-        break;
-      default:
-        s += ty[i];
-        break;
-    }
+  unsigned Base = (unsigned)NeonTypeFlags::Int8 + Addend;
+  if (Poly) {
+    // Adjustment needed because Poly32 doesn't exist.
+    if (Addend >= 2)
+      --Addend;
+    Base = (unsigned)NeonTypeFlags::Poly8 + Addend;
+  }
+  if (Float) {
+    assert(Addend != 0 && "Float8 doesn't exist!");
+    Base = (unsigned)NeonTypeFlags::Float16 + (Addend - 1);
   }
 
-  return s;
+  if (Bitwidth == 128)
+    Base |= (unsigned)NeonTypeFlags::QuadFlag;
+  if (isInteger() && !Signed)
+    Base |= (unsigned)NeonTypeFlags::UnsignedFlag;
+
+  return Base;
 }
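+// For illustration, Type::getNeonEnum() above maps a float32x4_t to Float32
+// with the QuadFlag set, while a uint8x8_t maps to Int8 with the
+// UnsignedFlag set.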
 
-/// For a particular StringRef, return the base type code, and whether it has
-/// the quad-vector, polynomial, or unsigned modifiers set.
-static char ClassifyType(StringRef ty, bool &quad, bool &poly, bool &usgn) {
-  unsigned off = 0;
-  // ignore scalar.
-  if (ty[off] == 'S') {
-    ++off;
+Type Type::fromTypedefName(StringRef Name) {
+  Type T;
+  T.Void = false;
+  T.Float = false;
+  T.Poly = false;
+
+  if (Name.front() == 'u') {
+    T.Signed = false;
+    Name = Name.drop_front();
+  } else {
+    T.Signed = true;
   }
-  // remember quad.
-  if (ty[off] == 'Q' || ty[off] == 'H') {
-    quad = true;
-    ++off;
+
+  if (Name.startswith("float")) {
+    T.Float = true;
+    Name = Name.drop_front(5);
+  } else if (Name.startswith("poly")) {
+    T.Poly = true;
+    Name = Name.drop_front(4);
+  } else {
+    assert(Name.startswith("int"));
+    Name = Name.drop_front(3);
   }
 
-  // remember poly.
-  if (ty[off] == 'P') {
-    poly = true;
-    ++off;
+  unsigned I = 0;
+  for (I = 0; I < Name.size(); ++I) {
+    if (!isdigit(Name[I]))
+      break;
   }
+  Name.substr(0, I).getAsInteger(10, T.ElementBitwidth);
+  Name = Name.drop_front(I);
 
-  // remember unsigned.
-  if (ty[off] == 'U') {
-    usgn = true;
-    ++off;
+  T.Bitwidth = T.ElementBitwidth;
+  T.NumVectors = 1;
+
+  if (Name.front() == 'x') {
+    Name = Name.drop_front();
+    unsigned I = 0;
+    for (I = 0; I < Name.size(); ++I) {
+      if (!isdigit(Name[I]))
+        break;
+    }
+    unsigned NumLanes;
+    Name.substr(0, I).getAsInteger(10, NumLanes);
+    Name = Name.drop_front(I);
+    T.Bitwidth = T.ElementBitwidth * NumLanes;
+  } else {
+    // Was scalar.
+    T.NumVectors = 0;
+  }
+  if (Name.front() == 'x') {
+    Name = Name.drop_front();
+    unsigned I = 0;
+    for (I = 0; I < Name.size(); ++I) {
+      if (!isdigit(Name[I]))
+        break;
+    }
+    Name.substr(0, I).getAsInteger(10, T.NumVectors);
+    Name = Name.drop_front(I);
   }
 
-  // base type to get the type string for.
-  return ty[off];
+  assert(Name.startswith("_t") && "Malformed typedef!");
+  return T;
 }
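+// Illustrative examples of Type::fromTypedefName() above: "uint32x4_t" parses
+// to an unsigned 32-bit element type with 4 lanes (128 bits total), and
+// "float64_t" parses to a scalar 64-bit floating point type.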
 
-/// ModType - Transform a type code and its modifiers based on a mod code. The
-/// mod code definitions may be found at the top of arm_neon.td.
-static char ModType(const char mod, char type, bool &quad, bool &poly,
-                    bool &usgn, bool &scal, bool &cnst, bool &pntr) {
-  switch (mod) {
-    case 't':
-      if (poly) {
-        poly = false;
-        usgn = true;
-      }
-      break;
-    case 'b':
-      scal = true;
-    case 'u':
-      usgn = true;
-      poly = false;
-      if (type == 'f')
-        type = 'i';
-      if (type == 'd')
-        type = 'l';
-      break;
-    case '$':
-      scal = true;
-    case 'x':
-      usgn = false;
-      poly = false;
-      if (type == 'f')
-        type = 'i';
-      if (type == 'd')
-        type = 'l';
-      break;
-    case 'o':
-      scal = true;
-      type = 'd';
-      usgn = false;
-      break;
-    case 'y':
-      scal = true;
-    case 'f':
-      if (type == 'h')
-        quad = true;
-      type = 'f';
-      usgn = false;
-      break;
-    case 'F':
-      type = 'd';
-      usgn = false;
-      break;
-    case 'g':
-      quad = false;
-      break;
-    case 'B':
-    case 'C':
-    case 'D':
-    case 'j':
-      quad = true;
-      break;
-    case 'w':
-      type = Widen(type);
-      quad = true;
-      break;
-    case 'n':
-      type = Widen(type);
-      break;
-    case 'i':
-      type = 'i';
-      scal = true;
-      break;
-    case 'l':
-      type = 'l';
-      scal = true;
-      usgn = true;
+void Type::applyTypespec(bool &Quad) {
+  std::string S = TS;
+  ScalarForMangling = false;
+  Void = false;
+  Poly = Float = false;
+  ElementBitwidth = ~0U;
+  Signed = true;
+  NumVectors = 1;
+
+  for (char I : S) {
+    switch (I) {
+    case 'S':
+      ScalarForMangling = true;
       break;
-    case 'z':
-      type = Narrow(type);
-      scal = true;
+    case 'H':
+      NoManglingQ = true;
+      Quad = true;
       break;
-    case 'r':
-      type = Widen(type);
-      scal = true;
+    case 'Q':
+      Quad = true;
       break;
-    case 's':
-    case 'a':
-      scal = true;
+    case 'P':
+      Poly = true;
       break;
-    case 'k':
-      quad = true;
+    case 'U':
+      Signed = false;
       break;
     case 'c':
-      cnst = true;
-    case 'p':
-      pntr = true;
-      scal = true;
+      ElementBitwidth = 8;
       break;
     case 'h':
-      type = Narrow(type);
-      if (type == 'h')
-        quad = false;
-      break;
-    case 'q':
-      type = Narrow(type);
-      quad = true;
-      break;
-    case 'e':
-      type = Narrow(type);
-      usgn = true;
-      break;
-    case 'm':
-      type = Narrow(type);
-      quad = false;
-      break;
-    default:
-      break;
-  }
-  return type;
-}
-
-static bool IsMultiVecProto(const char p) {
-  return ((p >= '2' && p <= '4') || (p >= 'B' && p <= 'D'));
-}
-
-/// TypeString - for a modifier and type, generate the name of the typedef for
-/// that type.  QUc -> uint8x8_t.
-static std::string TypeString(const char mod, StringRef typestr) {
-  bool quad = false;
-  bool poly = false;
-  bool usgn = false;
-  bool scal = false;
-  bool cnst = false;
-  bool pntr = false;
-
-  if (mod == 'v')
-    return "void";
-  if (mod == 'i')
-    return "int";
-
-  // base type to get the type string for.
-  char type = ClassifyType(typestr, quad, poly, usgn);
-
-  // Based on the modifying character, change the type and width if necessary.
-  type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);
-
-  SmallString<128> s;
-
-  if (usgn)
-    s.push_back('u');
-
-  switch (type) {
-    case 'c':
-      s += poly ? "poly8" : "int8";
-      if (scal)
-        break;
-      s += quad ? "x16" : "x8";
-      break;
+      Float = true;
+    // Fall through
     case 's':
-      s += poly ? "poly16" : "int16";
-      if (scal)
-        break;
-      s += quad ? "x8" : "x4";
+      ElementBitwidth = 16;
       break;
+    case 'f':
+      Float = true;
+    // Fall through
     case 'i':
-      s += "int32";
-      if (scal)
-        break;
-      s += quad ? "x4" : "x2";
+      ElementBitwidth = 32;
       break;
+    case 'd':
+      Float = true;
+    // Fall through
     case 'l':
-      s += (poly && !usgn)? "poly64" : "int64";
-      if (scal)
-        break;
-      s += quad ? "x2" : "x1";
+      ElementBitwidth = 64;
       break;
     case 'k':
-      s += "poly128";
-      break;
-    case 'h':
-      s += "float16";
-      if (scal)
-        break;
-      s += quad ? "x8" : "x4";
-      break;
-    case 'f':
-      s += "float32";
-      if (scal)
-        break;
-      s += quad ? "x4" : "x2";
-      break;
-    case 'd':
-      s += "float64";
-      if (scal)
-        break;
-      s += quad ? "x2" : "x1";
+      ElementBitwidth = 128;
+      // Poly doesn't have a 128x1 type.
+      if (Poly)
+        NumVectors = 0;
       break;
-
     default:
-      PrintFatalError("unhandled type!");
+      assert(0 && "Unhandled type code!");
+    }
   }
+  assert(ElementBitwidth != ~0U && "Bad element bitwidth!");
 
-  if (mod == '2' || mod == 'B')
-    s += "x2";
-  if (mod == '3' || mod == 'C')
-    s += "x3";
-  if (mod == '4' || mod == 'D')
-    s += "x4";
-
-  // Append _t, finishing the type string typedef type.
-  s += "_t";
-
-  if (cnst)
-    s += " const";
-
-  if (pntr)
-    s += " *";
-
-  return s.str();
+  Bitwidth = Quad ? 128 : 64;
 }
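+// Illustrative example of Type::applyTypespec() above: the typespec "Qf"
+// (quad, float) produces a 128-bit vector of 32-bit floating point elements,
+// i.e. float32x4_t.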
 
-/// BuiltinTypeString - for a modifier and type, generate the clang
-/// BuiltinsARM.def prototype code for the function.  See the top of clang's
-/// Builtins.def for a description of the type strings.
-static std::string BuiltinTypeString(const char mod, StringRef typestr,
-                                     ClassKind ck, bool ret) {
-  bool quad = false;
-  bool poly = false;
-  bool usgn = false;
-  bool scal = false;
-  bool cnst = false;
-  bool pntr = false;
-
-  if (mod == 'v')
-    return "v"; // void
-  if (mod == 'i')
-    return "i"; // int
-
-  // base type to get the type string for.
-  char type = ClassifyType(typestr, quad, poly, usgn);
-
-  // Based on the modifying character, change the type and width if necessary.
-  type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);
-
-  usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
-                         scal && type != 'f' && type != 'd');
-
-  // All pointers are void* pointers.  Change type to 'v' now.
-  if (pntr) {
-    usgn = false;
-    poly = false;
-    type = 'v';
-  }
-  // Treat half-float ('h') types as unsigned short ('s') types.
-  if (type == 'h') {
-    type = 's';
-    usgn = true;
-  }
-
-  if (scal) {
-    SmallString<128> s;
-
-    if (usgn)
-      s.push_back('U');
-    else if (type == 'c')
-      s.push_back('S'); // make chars explicitly signed
-
-    if (type == 'l') // 64-bit long
-      s += "Wi";
-    else if (type == 'k') // 128-bit long
-      s = "LLLi";
-    else
-      s.push_back(type);
-
-    if (cnst)
-      s.push_back('C');
-    if (pntr)
-      s.push_back('*');
-    return s.str();
-  }
-
-  // Since the return value must be one type, return a vector type of the
-  // appropriate width which we will bitcast.  An exception is made for
-  // returning structs of 2, 3, or 4 vectors which are returned in a sret-like
-  // fashion, storing them to a pointer arg.
-  if (ret) {
-    if (IsMultiVecProto(mod))
-      return "vv*"; // void result with void* first argument
-    if (mod == 'f' || (ck != ClassB && type == 'f'))
-      return quad ? "V4f" : "V2f";
-    if (mod == 'F' || (ck != ClassB && type == 'd'))
-      return quad ? "V2d" : "V1d";
-    if (ck != ClassB && type == 's')
-      return quad ? "V8s" : "V4s";
-    if (ck != ClassB && type == 'i')
-      return quad ? "V4i" : "V2i";
-    if (ck != ClassB && type == 'l')
-      return quad ? "V2Wi" : "V1Wi";
-
-    return quad ? "V16Sc" : "V8Sc";
-  }
-
-  // Non-return array types are passed as individual vectors.
-  if (mod == '2' || mod == 'B')
-    return quad ? "V16ScV16Sc" : "V8ScV8Sc";
-  if (mod == '3' || mod == 'C')
-    return quad ? "V16ScV16ScV16Sc" : "V8ScV8ScV8Sc";
-  if (mod == '4' || mod == 'D')
-    return quad ? "V16ScV16ScV16ScV16Sc" : "V8ScV8ScV8ScV8Sc";
-
-  if (mod == 'f' || (ck != ClassB && type == 'f'))
-    return quad ? "V4f" : "V2f";
-  if (mod == 'F' || (ck != ClassB && type == 'd'))
-    return quad ? "V2d" : "V1d";
-  if (ck != ClassB && type == 's')
-    return quad ? "V8s" : "V4s";
-  if (ck != ClassB && type == 'i')
-    return quad ? "V4i" : "V2i";
-  if (ck != ClassB && type == 'l')
-    return quad ? "V2Wi" : "V1Wi";
-
-  return quad ? "V16Sc" : "V8Sc";
-}
+void Type::applyModifier(char Mod) {
+  bool AppliedQuad = false;
+  applyTypespec(AppliedQuad);
 
-/// InstructionTypeCode - Computes the ARM argument character code and
-/// quad status for a specific type string and ClassKind.
-static void InstructionTypeCode(const StringRef &typeStr,
-                                const ClassKind ck,
-                                bool &quad,
-                                std::string &typeCode) {
-  bool poly = false;
-  bool usgn = false;
-  char type = ClassifyType(typeStr, quad, poly, usgn);
-
-  switch (type) {
-  case 'c':
-    switch (ck) {
-    case ClassS: typeCode = poly ? "p8" : usgn ? "u8" : "s8"; break;
-    case ClassI: typeCode = "i8"; break;
-    case ClassW: typeCode = "8"; break;
-    default: break;
-    }
+  switch (Mod) {
+  case 'v':
+    Void = true;
     break;
-  case 's':
-    switch (ck) {
-    case ClassS: typeCode = poly ? "p16" : usgn ? "u16" : "s16"; break;
-    case ClassI: typeCode = "i16"; break;
-    case ClassW: typeCode = "16"; break;
-    default: break;
+  case 't':
+    if (Poly) {
+      Poly = false;
+      Signed = false;
     }
     break;
+  case 'b':
+    Signed = false;
+    Float = false;
+    Poly = false;
+    NumVectors = 0;
+    Bitwidth = ElementBitwidth;
+    break;
+  case '$':
+    Signed = true;
+    Float = false;
+    Poly = false;
+    NumVectors = 0;
+    Bitwidth = ElementBitwidth;
+    break;
+  case 'u':
+    Signed = false;
+    Poly = false;
+    Float = false;
+    break;
+  case 'x':
+    Signed = true;
+    assert(!Poly && "'x' can't be used with poly types!");
+    Float = false;
+    break;
+  case 'o':
+    Bitwidth = ElementBitwidth = 64;
+    NumVectors = 0;
+    Float = true;
+    break;
+  case 'y':
+    Bitwidth = ElementBitwidth = 32;
+    NumVectors = 0;
+    Float = true;
+    break;
+  case 'f':
+    // Special case - if we're half-precision, a floating
+    // point argument needs to be 128-bits (double size).
+    if (isHalf())
+      Bitwidth = 128;
+    Float = true;
+    ElementBitwidth = 32;
+    break;
+  case 'F':
+    Float = true;
+    ElementBitwidth = 64;
+    break;
+  case 'g':
+    if (AppliedQuad)
+      Bitwidth /= 2;
+    break;
+  case 'j':
+    if (!AppliedQuad)
+      Bitwidth *= 2;
+    break;
+  case 'w':
+    ElementBitwidth *= 2;
+    Bitwidth *= 2;
+    break;
+  case 'n':
+    ElementBitwidth *= 2;
+    break;
   case 'i':
-    switch (ck) {
-    case ClassS: typeCode = usgn ? "u32" : "s32"; break;
-    case ClassI: typeCode = "i32"; break;
-    case ClassW: typeCode = "32"; break;
-    default: break;
-    }
+    Float = false;
+    Poly = false;
+    ElementBitwidth = Bitwidth = 32;
+    NumVectors = 0;
+    Signed = true;
     break;
   case 'l':
-    switch (ck) {
-    case ClassS: typeCode = poly ? "p64" : usgn ? "u64" : "s64"; break;
-    case ClassI: typeCode = "i64"; break;
-    case ClassW: typeCode = "64"; break;
-    default: break;
-    }
+    Float = false;
+    Poly = false;
+    ElementBitwidth = Bitwidth = 64;
+    NumVectors = 0;
+    Signed = false;
+    break;
+  case 'z':
+    ElementBitwidth /= 2;
+    Bitwidth = ElementBitwidth;
+    NumVectors = 0;
+    break;
+  case 'r':
+    ElementBitwidth *= 2;
+    Bitwidth = ElementBitwidth;
+    NumVectors = 0;
+    break;
+  case 's':
+  case 'a':
+    Bitwidth = ElementBitwidth;
+    NumVectors = 0;
     break;
   case 'k':
-    assert(poly && "Unrecognized 128 bit integer.");
-    typeCode = "p128";
+    Bitwidth *= 2;
+    break;
+  case 'c':
+    Constant = true;
+  // Fall through
+  case 'p':
+    Pointer = true;
+    Bitwidth = ElementBitwidth;
+    NumVectors = 0;
     break;
   case 'h':
-    switch (ck) {
-    case ClassS:
-    case ClassI: typeCode = "f16"; break;
-    case ClassW: typeCode = "16"; break;
-    default: break;
-    }
+    ElementBitwidth /= 2;
     break;
-  case 'f':
-    switch (ck) {
-    case ClassS:
-    case ClassI: typeCode = "f32"; break;
-    case ClassW: typeCode = "32"; break;
-    default: break;
-    }
+  case 'q':
+    ElementBitwidth /= 2;
+    Bitwidth *= 2;
+    break;
+  case 'e':
+    ElementBitwidth /= 2;
+    Signed = false;
+    break;
+  case 'm':
+    ElementBitwidth /= 2;
+    Bitwidth /= 2;
     break;
   case 'd':
-    switch (ck) {
-    case ClassS:
-    case ClassI:
-      typeCode += "f64";
-      break;
-    case ClassW:
-      PrintFatalError("unhandled type!");
-    default:
-      break;
-    }
+    break;
+  case '2':
+    NumVectors = 2;
+    break;
+  case '3':
+    NumVectors = 3;
+    break;
+  case '4':
+    NumVectors = 4;
+    break;
+  case 'B':
+    NumVectors = 2;
+    if (!AppliedQuad)
+      Bitwidth *= 2;
+    break;
+  case 'C':
+    NumVectors = 3;
+    if (!AppliedQuad)
+      Bitwidth *= 2;
+    break;
+  case 'D':
+    NumVectors = 4;
+    if (!AppliedQuad)
+      Bitwidth *= 2;
     break;
   default:
-    PrintFatalError("unhandled type!");
+    assert(0 && "Unhandled character!");
   }
 }
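+// Illustrative examples for Type::applyModifier() above: starting from an
+// int32x2_t base type, the 'w' modifier widens both the elements and the
+// vector, giving int64x2_t, while 'n' widens only the elements, giving
+// int64x1_t.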
 
-static char Insert_BHSD_Suffix(StringRef typestr){
-  unsigned off = 0;
-  if(typestr[off++] == 'S'){
-    while(typestr[off] == 'Q' || typestr[off] == 'H'||
-          typestr[off] == 'P' || typestr[off] == 'U')
-      ++off;
-    switch (typestr[off]){
-    default  : break;
-    case 'c' : return 'b';
-    case 's' : return 'h';
-    case 'i' :
-    case 'f' : return 's';
-    case 'l' :
-    case 'd' : return 'd';
+//===----------------------------------------------------------------------===//
+// Intrinsic implementation
+//===----------------------------------------------------------------------===//
+
+std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) {
+  char typeCode = '\0';
+  bool printNumber = true;
+
+  if (CK == ClassB)
+    return "";
+
+  if (T.isPoly())
+    typeCode = 'p';
+  else if (T.isInteger())
+    typeCode = T.isSigned() ? 's' : 'u';
+  else
+    typeCode = 'f';
+
+  if (CK == ClassI) {
+    switch (typeCode) {
+    default:
+      break;
+    case 's':
+    case 'u':
+    case 'p':
+      typeCode = 'i';
+      break;
     }
   }
-  return 0;
-}
-
-static bool endsWith_xN(std::string const &name) {
-  if (name.length() > 3) {
-    if (name.compare(name.length() - 3, 3, "_x2") == 0 ||
-        name.compare(name.length() - 3, 3, "_x3") == 0 ||
-        name.compare(name.length() - 3, 3, "_x4") == 0)
-      return true;
+  if (CK == ClassB) {
+    typeCode = '\0';
   }
-  return false;
+
+  std::string S;
+  if (typeCode != '\0')
+    S.push_back(typeCode);
+  if (printNumber)
+    S += utostr(T.getElementSizeInBits());
+
+  return S;
 }
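+// Illustrative examples of getInstTypeCode() above: an unsigned 16-bit
+// element type yields "u16" under ClassS but "i16" under ClassI, and ClassB
+// always yields an empty string.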
 
-/// MangleName - Append a type or width suffix to a base neon function name,
-/// and insert a 'q' in the appropriate location if type string starts with 'Q'.
-/// E.g. turn "vst2_lane" into "vst2q_lane_f32", etc.
-/// Insert proper 'b' 'h' 's' 'd' if prefix 'S' is used.
-static std::string MangleName(const std::string &name, StringRef typestr,
-                              ClassKind ck) {
-  if (name == "vcvt_f32_f16" || name == "vcvt_f32_f64" ||
-      name == "vcvt_f64_f32")
-    return name;
+std::string Intrinsic::getBuiltinTypeStr() {
+  ClassKind LocalCK = getClassKind(true);
+  std::string S;
 
-  bool quad = false;
-  std::string typeCode = "";
+  Type RetT = getReturnType();
+  if ((LocalCK == ClassI || LocalCK == ClassW) && RetT.isScalar() &&
+      !RetT.isFloating())
+    RetT.makeInteger(RetT.getElementSizeInBits(), false);
 
-  InstructionTypeCode(typestr, ck, quad, typeCode);
+  // Since the return value must be one type, return a vector type of the
+  // appropriate width which we will bitcast.  An exception is made for
+  // returning structs of 2, 3, or 4 vectors which are returned in a sret-like
+  // fashion, storing them to a pointer arg.
+  if (RetT.getNumVectors() > 1) {
+    S += "vv*"; // void result with void* first argument
+  } else {
+    if (RetT.isPoly())
+      RetT.makeInteger(RetT.getElementSizeInBits(), false);
+    if (!RetT.isScalar() && !RetT.isSigned())
+      RetT.makeSigned();
 
-  std::string s = name;
+    bool ForcedVectorFloatingType = Proto[0] == 'F' || Proto[0] == 'f';
+    if (LocalCK == ClassB && !RetT.isScalar() && !ForcedVectorFloatingType)
+      // Cast to vector of 8-bit elements.
+      RetT.makeInteger(8, true);
 
-  if (typeCode.size() > 0) {
-    // If the name is end with _xN (N = 2,3,4), insert the typeCode before _xN.
-    if (endsWith_xN(s))
-      s.insert(s.length() - 3, "_" + typeCode);
-    else
-      s += "_" + typeCode;
+    S += RetT.builtin_str();
   }
 
-  if (ck == ClassB)
-    s += "_v";
+  for (unsigned I = 0; I < getNumParams(); ++I) {
+    Type T = getParamType(I);
+    if (T.isPoly())
+      T.makeInteger(T.getElementSizeInBits(), false);
 
-  // Insert a 'q' before the first '_' character so that it ends up before
-  // _lane or _n on vector-scalar operations.
-  if (typestr.find("Q") != StringRef::npos) {
-      size_t pos = s.find('_');
-      s = s.insert(pos, "q");
-  }
-  char ins = Insert_BHSD_Suffix(typestr);
-  if(ins){
-    size_t pos = s.find('_');
-    s = s.insert(pos, &ins, 1);
-  }
+    bool ForcedFloatingType = Proto[I + 1] == 'F' || Proto[I + 1] == 'f';
+    if (LocalCK == ClassB && !T.isScalar() && !ForcedFloatingType)
+      T.makeInteger(8, true);
+    // Halves always get converted to 8-bit elements.
+    if (T.isHalf() && T.isVector() && !T.isScalarForMangling())
+      T.makeInteger(8, true);
 
-  return s;
-}
-
-static void PreprocessInstruction(const StringRef &Name,
-                                  const std::string &InstName,
-                                  std::string &Prefix,
-                                  bool &HasNPostfix,
-                                  bool &HasLanePostfix,
-                                  bool &HasDupPostfix,
-                                  bool &IsSpecialVCvt,
-                                  size_t &TBNumber) {
-  // All of our instruction name fields from arm_neon.td are of the form
-  //   <instructionname>_...
-  // Thus we grab our instruction name via computation of said Prefix.
-  const size_t PrefixEnd = Name.find_first_of('_');
-  // If InstName is passed in, we use that instead of our name Prefix.
-  Prefix = InstName.size() == 0? Name.slice(0, PrefixEnd).str() : InstName;
-
-  const StringRef Postfix = Name.slice(PrefixEnd, Name.size());
-
-  HasNPostfix = Postfix.count("_n");
-  HasLanePostfix = Postfix.count("_lane");
-  HasDupPostfix = Postfix.count("_dup");
-  IsSpecialVCvt = Postfix.size() != 0 && Name.count("vcvt");
-
-  if (InstName.compare("vtbl") == 0 ||
-      InstName.compare("vtbx") == 0) {
-    // If we have a vtblN/vtbxN instruction, use the instruction's ASCII
-    // encoding to get its true value.
-    TBNumber = Name[Name.size()-1] - 48;
-  }
-}
-
-/// GenerateRegisterCheckPatternsForLoadStores - Given a bunch of data we have
-/// extracted, generate a FileCheck pattern for a Load Or Store
-static void
-GenerateRegisterCheckPatternForLoadStores(const StringRef &NameRef,
-                                          const std::string& OutTypeCode,
-                                          const bool &IsQuad,
-                                          const bool &HasDupPostfix,
-                                          const bool &HasLanePostfix,
-                                          const size_t Count,
-                                          std::string &RegisterSuffix) {
-  const bool IsLDSTOne = NameRef.count("vld1") || NameRef.count("vst1");
-  // If N == 3 || N == 4 and we are dealing with a quad instruction, Clang
-  // will output a series of v{ld,st}1s, so we have to handle it specially.
-  if ((Count == 3 || Count == 4) && IsQuad) {
-    RegisterSuffix += "{";
-    for (size_t i = 0; i < Count; i++) {
-      RegisterSuffix += "d{{[0-9]+}}";
-      if (HasDupPostfix) {
-        RegisterSuffix += "[]";
-      }
-      if (HasLanePostfix) {
-        RegisterSuffix += "[{{[0-9]+}}]";
-      }
-      if (i < Count-1) {
-        RegisterSuffix += ", ";
-      }
-    }
-    RegisterSuffix += "}";
-  } else {
-
-    // Handle normal loads and stores.
-    RegisterSuffix += "{";
-    for (size_t i = 0; i < Count; i++) {
-      RegisterSuffix += "d{{[0-9]+}}";
-      if (HasDupPostfix) {
-        RegisterSuffix += "[]";
-      }
-      if (HasLanePostfix) {
-        RegisterSuffix += "[{{[0-9]+}}]";
-      }
-      if (IsQuad && !HasLanePostfix) {
-        RegisterSuffix += ", d{{[0-9]+}}";
-        if (HasDupPostfix) {
-          RegisterSuffix += "[]";
-        }
-      }
-      if (i < Count-1) {
-        RegisterSuffix += ", ";
-      }
-    }
-    RegisterSuffix += "}, [r{{[0-9]+}}";
+    if (LocalCK == ClassI)
+      T.makeSigned();
 
-    // We only include the alignment hint if we have a vld1.*64 or
-    // a dup/lane instruction.
-    if (IsLDSTOne) {
-      if ((HasLanePostfix || HasDupPostfix) && OutTypeCode != "8") {
-        RegisterSuffix += ":" + OutTypeCode;
-      }
-    }
+    // Constant indices are always just "int".
+    if (hasImmediate() && getImmediateIdx() == I)
+      T.makeInteger(32, true);
 
-    RegisterSuffix += "]";
+    S += T.builtin_str();
   }
-}
-
-static bool HasNPostfixAndScalarArgs(const StringRef &NameRef,
-                                     const bool &HasNPostfix) {
-  return (NameRef.count("vmla") ||
-          NameRef.count("vmlal") ||
-          NameRef.count("vmlsl") ||
-          NameRef.count("vmull") ||
-          NameRef.count("vqdmlal") ||
-          NameRef.count("vqdmlsl") ||
-          NameRef.count("vqdmulh") ||
-          NameRef.count("vqdmull") ||
-          NameRef.count("vqrdmulh")) && HasNPostfix;
-}
 
-static bool IsFiveOperandLaneAccumulator(const StringRef &NameRef,
-                                         const bool &HasLanePostfix) {
-  return (NameRef.count("vmla") ||
-          NameRef.count("vmls") ||
-          NameRef.count("vmlal") ||
-          NameRef.count("vmlsl") ||
-          (NameRef.count("vmul") && NameRef.size() == 3)||
-          NameRef.count("vqdmlal") ||
-          NameRef.count("vqdmlsl") ||
-          NameRef.count("vqdmulh") ||
-          NameRef.count("vqrdmulh")) && HasLanePostfix;
-}
+  // Extra constant integer to hold type class enum for this function, e.g. s8
+  if (LocalCK == ClassB)
+    S += "i";
 
-static bool IsSpecialLaneMultiply(const StringRef &NameRef,
-                                  const bool &HasLanePostfix,
-                                  const bool &IsQuad) {
-  const bool IsVMulOrMulh = (NameRef.count("vmul") || NameRef.count("mulh"))
-                               && IsQuad;
-  const bool IsVMull = NameRef.count("mull") && !IsQuad;
-  return (IsVMulOrMulh || IsVMull) && HasLanePostfix;
+  return S;
 }
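+// Illustrative example of getBuiltinTypeStr() above: a ClassB intrinsic
+// taking and returning 128-bit vectors has its argument and return types
+// cast to vectors of signed 8-bit elements and gains a trailing "i" for the
+// type-class enum, giving "V16ScV16ScV16Sci".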
 
-static void NormalizeProtoForRegisterPatternCreation(const std::string &Name,
-                                                     const std::string &Proto,
-                                                     const bool &HasNPostfix,
-                                                     const bool &IsQuad,
-                                                     const bool &HasLanePostfix,
-                                                     const bool &HasDupPostfix,
-                                                     std::string &NormedProto) {
-  // Handle generic case.
-  const StringRef NameRef(Name);
-  for (size_t i = 0, end = Proto.size(); i < end; i++) {
-    switch (Proto[i]) {
-    case 'u':
-    case 'f':
-    case 'F':
-    case 'd':
-    case 's':
-    case 'x':
-    case 't':
-    case 'n':
-      NormedProto += IsQuad? 'q' : 'd';
-      break;
-    case 'w':
-    case 'k':
-      NormedProto += 'q';
-      break;
-    case 'g':
-    case 'j':
-    case 'h':
-    case 'e':
-      NormedProto += 'd';
-      break;
-    case 'i':
-      NormedProto += HasLanePostfix? 'a' : 'i';
-      break;
-    case 'a':
-      if (HasLanePostfix) {
-        NormedProto += 'a';
-      } else if (HasNPostfixAndScalarArgs(NameRef, HasNPostfix)) {
-        NormedProto += IsQuad? 'q' : 'd';
-      } else {
-        NormedProto += 'i';
-      }
-      break;
-    }
-  }
+std::string Intrinsic::getMangledName(bool ForceClassS) {
+  // Check if the prototype has a scalar operand with the type of the vector
+  // elements.  If not, bitcasting the args will take care of arg checking.
+  // The actual signedness etc. will be taken care of with special enums.
+  ClassKind LocalCK = CK;
+  if (!protoHasScalar())
+    LocalCK = ClassB;
 
-  // Handle Special Cases.
-  const bool IsNotVExt = !NameRef.count("vext");
-  const bool IsVPADAL = NameRef.count("vpadal");
-  const bool Is5OpLaneAccum = IsFiveOperandLaneAccumulator(NameRef,
-                                                           HasLanePostfix);
-  const bool IsSpecialLaneMul = IsSpecialLaneMultiply(NameRef, HasLanePostfix,
-                                                      IsQuad);
-
-  if (IsSpecialLaneMul) {
-    // If
-    NormedProto[2] = NormedProto[3];
-    NormedProto.erase(3);
-  } else if (NormedProto.size() == 4 &&
-             NormedProto[0] == NormedProto[1] &&
-             IsNotVExt) {
-    // If NormedProto.size() == 4 and the first two proto characters are the
-    // same, ignore the first.
-    NormedProto = NormedProto.substr(1, 3);
-  } else if (Is5OpLaneAccum) {
-    // If we have a 5 op lane accumulator operation, we take characters 1,2,4
-    std::string tmp = NormedProto.substr(1,2);
-    tmp += NormedProto[4];
-    NormedProto = tmp;
-  } else if (IsVPADAL) {
-    // If we have VPADAL, ignore the first character.
-    NormedProto = NormedProto.substr(0, 2);
-  } else if (NameRef.count("vdup") && NormedProto.size() > 2) {
-    // If our instruction is a dup instruction, keep only the first and
-    // last characters.
-    std::string tmp = "";
-    tmp += NormedProto[0];
-    tmp += NormedProto[NormedProto.size()-1];
-    NormedProto = tmp;
-  }
+  return mangleName(Name, ForceClassS ? ClassS : LocalCK);
 }
 
-/// GenerateRegisterCheckPatterns - Given a bunch of data we have
-/// extracted, generate a FileCheck pattern to check that an
-/// instruction's arguments are correct.
-static void GenerateRegisterCheckPattern(const std::string &Name,
-                                         const std::string &Proto,
-                                         const std::string &OutTypeCode,
-                                         const bool &HasNPostfix,
-                                         const bool &IsQuad,
-                                         const bool &HasLanePostfix,
-                                         const bool &HasDupPostfix,
-                                         const size_t &TBNumber,
-                                         std::string &RegisterSuffix) {
+std::string Intrinsic::mangleName(std::string Name, ClassKind LocalCK) {
+  std::string typeCode = getInstTypeCode(BaseType, LocalCK);
+  std::string S = Name;
 
-  RegisterSuffix = "";
+  if (Name == "vcvt_f32_f16" || Name == "vcvt_f32_f64" ||
+      Name == "vcvt_f64_f32")
+    return Name;
 
-  const StringRef NameRef(Name);
-
-  if ((NameRef.count("vdup") || NameRef.count("vmov")) && HasNPostfix) {
-    return;
+  if (typeCode.size() > 0) {
+    // If the name ends with _xN (N = 2,3,4), insert the typeCode before _xN.
+    if (Name.size() >= 3 && isdigit(Name.back()) &&
+        Name[Name.length() - 2] == 'x' && Name[Name.length() - 3] == '_')
+      S.insert(S.length() - 3, "_" + typeCode);
+    else
+      S += "_" + typeCode;
   }
 
-  const bool IsLoadStore = NameRef.count("vld") || NameRef.count("vst");
-  const bool IsTBXOrTBL = NameRef.count("vtbl") || NameRef.count("vtbx");
-
-  if (IsLoadStore) {
-    // Grab N value from  v{ld,st}N using its ascii representation.
-    const size_t Count = NameRef[3] - 48;
-
-    GenerateRegisterCheckPatternForLoadStores(NameRef, OutTypeCode, IsQuad,
-                                              HasDupPostfix, HasLanePostfix,
-                                              Count, RegisterSuffix);
-  } else if (IsTBXOrTBL) {
-    RegisterSuffix += "d{{[0-9]+}}, {";
-    for (size_t i = 0; i < TBNumber-1; i++) {
-      RegisterSuffix += "d{{[0-9]+}}, ";
-    }
-    RegisterSuffix += "d{{[0-9]+}}}, d{{[0-9]+}}";
-  } else {
-    // Handle a normal instruction.
-    if (NameRef.count("vget") || NameRef.count("vset"))
-      return;
-
-    // We first normalize our proto, since we only need to emit 4
-    // different types of checks, yet have more than 4 proto types
-    // that map onto those 4 patterns.
-    std::string NormalizedProto("");
-    NormalizeProtoForRegisterPatternCreation(Name, Proto, HasNPostfix, IsQuad,
-                                             HasLanePostfix, HasDupPostfix,
-                                             NormalizedProto);
-
-    for (size_t i = 0, end = NormalizedProto.size(); i < end; i++) {
-      const char &c = NormalizedProto[i];
-      switch (c) {
-      case 'q':
-        RegisterSuffix += "q{{[0-9]+}}, ";
-        break;
-
-      case 'd':
-        RegisterSuffix += "d{{[0-9]+}}, ";
-        break;
+  if (BaseType != InBaseType) {
+    // A reinterpret - write the input base type out at the end.
+    S += "_" + getInstTypeCode(InBaseType, LocalCK);
+  }
 
-      case 'i':
-        RegisterSuffix += "#{{[0-9]+}}, ";
-        break;
+  if (LocalCK == ClassB)
+    S += "_v";
 
-      case 'a':
-        RegisterSuffix += "d{{[0-9]+}}[{{[0-9]}}], ";
-        break;
-      }
+  // Insert a 'q' before the first '_' character so that it ends up before
+  // _lane or _n on vector-scalar operations.
+  if (BaseType.getSizeInBits() == 128 && !BaseType.noManglingQ()) {
+    size_t Pos = S.find('_');
+    S.insert(Pos, "q");
+  }
+
+  char Suffix = '\0';
+  if (BaseType.isScalarForMangling()) {
+    switch (BaseType.getElementSizeInBits()) {
+    case 8: Suffix = 'b'; break;
+    case 16: Suffix = 'h'; break;
+    case 32: Suffix = 's'; break;
+    case 64: Suffix = 'd'; break;
+    default: assert(0 && "Bad suffix!");
     }
-
-    // Remove extra ", ".
-    RegisterSuffix = RegisterSuffix.substr(0, RegisterSuffix.size()-2);
   }
-}
-
-/// GenerateChecksForIntrinsic - Given a specific instruction name +
-/// typestr + class kind, generate the proper set of FileCheck
-/// Patterns to check for. We could just return a string, but instead
-/// use a vector since it provides us with the extra flexibility of
-/// emitting multiple checks, which comes in handy for certain cases
-/// like mla where we want to check for 2 different instructions.
-static void GenerateChecksForIntrinsic(const std::string &Name,
-                                       const std::string &Proto,
-                                       StringRef &OutTypeStr,
-                                       StringRef &InTypeStr,
-                                       ClassKind Ck,
-                                       const std::string &InstName,
-                                       bool IsHiddenLOp,
-                                       std::vector<std::string>& Result) {
-
-  // If Ck is a ClassNoTest instruction, just return so no test is
-  // emitted.
-  if(Ck == ClassNoTest)
-    return;
-
-  if (Name == "vcvt_f32_f16") {
-    Result.push_back("vcvt.f32.f16");
-    return;
+  if (Suffix != '\0') {
+    size_t Pos = S.find('_');
+    S.insert(Pos, &Suffix, 1);
   }
 
+  return S;
+}
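+// Illustrative example of mangleName() above: the name "vadd" with a
+// uint32x4_t base type mangles to "vaddq_u32" under ClassS; the "_u32" suffix
+// comes from getInstTypeCode() and the 'q' is inserted because the base type
+// is 128 bits wide.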
 
-  // Now we preprocess our instruction given the data we have to get the
-  // data that we need.
-  // Create a StringRef for String Manipulation of our Name.
-  const StringRef NameRef(Name);
-  // Instruction Prefix.
-  std::string Prefix;
-  // The type code for our out type string.
-  std::string OutTypeCode;
-  // To handle our different cases, we need to check for different postfixes.
-  // Is our instruction a quad instruction.
-  bool IsQuad = false;
-  // Our instruction is of the form <instructionname>_n.
-  bool HasNPostfix = false;
-  // Our instruction is of the form <instructionname>_lane.
-  bool HasLanePostfix = false;
-  // Our instruction is of the form <instructionname>_dup.
-  bool HasDupPostfix  = false;
-  // Our instruction is a vcvt instruction which requires special handling.
-  bool IsSpecialVCvt = false;
-  // If we have a vtbxN or vtblN instruction, this is set to N.
-  size_t TBNumber = -1;
-  // Register Suffix
-  std::string RegisterSuffix;
-
-  PreprocessInstruction(NameRef, InstName, Prefix,
-                        HasNPostfix, HasLanePostfix, HasDupPostfix,
-                        IsSpecialVCvt, TBNumber);
-
-  InstructionTypeCode(OutTypeStr, Ck, IsQuad, OutTypeCode);
-  GenerateRegisterCheckPattern(Name, Proto, OutTypeCode, HasNPostfix, IsQuad,
-                               HasLanePostfix, HasDupPostfix, TBNumber,
-                               RegisterSuffix);
-
-  // In the following section, we handle a bunch of special cases. You can tell
-  // a special case by the fact we are returning early.
-
-  // If our instruction is a logical instruction without postfix or a
-  // hidden LOp just return the current Prefix.
-  if (Ck == ClassL || IsHiddenLOp) {
-    Result.push_back(Prefix + " " + RegisterSuffix);
-    return;
-  }
+std::string Intrinsic::replaceParamsIn(std::string S) {
+  while (S.find('$') != std::string::npos) {
+    size_t Pos = S.find('$');
+    size_t End = Pos + 1;
+    while (isalpha(S[End]))
+      ++End;
 
-  // If we have a vmov, due to the many different cases, some of which
-  // vary within the different intrinsics generated for a single
-  // instruction type, just output a vmov. (e.g. given an instruction
-  // A, A.u32 might be vmov and A.u8 might be vmov.8).
-  //
-  // FIXME: Maybe something can be done about this. The two cases that we care
-  // about are vmov as an LType and vmov as a WType.
-  if (Prefix == "vmov") {
-    Result.push_back(Prefix + " " + RegisterSuffix);
-    return;
+    std::string VarName = S.substr(Pos + 1, End - Pos - 1);
+    assert_with_loc(Variables.find(VarName) != Variables.end(),
+                    "Variable not defined!");
+    S.replace(Pos, End - Pos, Variables.find(VarName)->second.getName());
   }
 
-  // In the following section, we handle special cases.
+  return S;
+}
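+// Illustrative example of replaceParamsIn() above: given a body string such
+// as "$p0 + $p1", each "$pN" token is replaced with the emitted name of the
+// corresponding parameter variable.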
 
-  if (OutTypeCode == "64") {
-    // If we have a 64 bit vdup/vext and are handling an uint64x1_t
-    // type, the intrinsic will be optimized away, so just return
-    // nothing.  On the other hand if we are handling an uint64x2_t
-    // (i.e. quad instruction), vdup/vmov instructions should be
-    // emitted.
-    if (Prefix == "vdup" || Prefix == "vext") {
-      if (IsQuad) {
-        Result.push_back("{{vmov|vdup}}");
-      }
-      return;
-    }
+void Intrinsic::initVariables() {
+  Variables.clear();
 
-    // v{st,ld}{2,3,4}_{u,s}64 emit v{st,ld}1.64 instructions with
-    // multiple register operands.
-    bool MultiLoadPrefix = Prefix == "vld2" || Prefix == "vld3"
-                            || Prefix == "vld4";
-    bool MultiStorePrefix = Prefix == "vst2" || Prefix == "vst3"
-                            || Prefix == "vst4";
-    if (MultiLoadPrefix || MultiStorePrefix) {
-      Result.push_back(NameRef.slice(0, 3).str() + "1.64");
-      return;
-    }
+  // Modify the TypeSpec per-argument to get a concrete Type, and create
+  // known variables for each.
+  for (unsigned I = 1; I < Proto.size(); ++I) {
+    char NameC = '0' + (I - 1);
+    std::string Name = "p";
+    Name.push_back(NameC);
 
-    // v{st,ld}1_{lane,dup}_{u64,s64} use vldr/vstr/vmov/str instead of
-    // emitting said instructions. So return a check for
-    // vldr/vstr/vmov/str instead.
-    if (HasLanePostfix || HasDupPostfix) {
-      if (Prefix == "vst1") {
-        Result.push_back("{{str|vstr|vmov}}");
-        return;
-      } else if (Prefix == "vld1") {
-        Result.push_back("{{ldr|vldr|vmov}}");
-        return;
-      }
-    }
+    Variables[Name] = Variable(Types[I], Name + VariablePostfix);
   }
+  RetVar = Variable(Types[0], "ret" + VariablePostfix);
+}
 
-  // vzip.32/vuzp.32 are the same instruction as vtrn.32 and are
-  // sometimes disassembled as vtrn.32. We use a regex to handle both
-  // cases.
-  if ((Prefix == "vzip" || Prefix == "vuzp") && OutTypeCode == "32") {
-    Result.push_back("{{vtrn|" + Prefix + "}}.32 " + RegisterSuffix);
-    return;
-  }
+void Intrinsic::emitPrototype() {
+  if (UseMacro)
+    OS << "#define ";
+  else
+    OS << "__ai " << Types[0].str() << " ";
 
-  // Currently on most ARM processors, we do not use vmla/vmls for
-  // quad floating point operations. Instead we output vmul + vadd. So
-  // check if we have one of those instructions and just output a
-  // check for vmul.
-  if (OutTypeCode == "f32") {
-    if (Prefix == "vmls") {
-      Result.push_back("vmul." + OutTypeCode + " " + RegisterSuffix);
-      Result.push_back("vsub." + OutTypeCode);
-      return;
-    } else if (Prefix == "vmla") {
-      Result.push_back("vmul." + OutTypeCode + " " + RegisterSuffix);
-      Result.push_back("vadd." + OutTypeCode);
-      return;
-    }
-  }
+  OS << mangleName(Name, ClassS) << "(";
 
-  // If we have vcvt, get the input type from the instruction name
-  // (which should be of the form instname_inputtype) and append it
-  // before the output type.
-  if (Prefix == "vcvt") {
-    const std::string inTypeCode = NameRef.substr(NameRef.find_last_of("_")+1);
-    Prefix += "." + inTypeCode;
-  }
+  for (unsigned I = 0; I < getNumParams(); ++I) {
+    if (I != 0)
+      OS << ", ";
 
-  // Append output type code to get our final mangled instruction.
-  Prefix += "." + OutTypeCode;
+    char NameC = '0' + I;
+    std::string Name = "p";
+    Name.push_back(NameC);
+    assert(Variables.find(Name) != Variables.end());
+    Variable &V = Variables[Name];
 
-  Result.push_back(Prefix + " " + RegisterSuffix);
-}
+    if (!UseMacro)
+      OS << V.getType().str() << " ";
+    OS << V.getName();
+  }
 
-/// UseMacro - Examine the prototype string to determine if the intrinsic
-/// should be defined as a preprocessor macro instead of an inline function.
-static bool UseMacro(const std::string &proto, StringRef typestr) {
-  // If this builtin takes an immediate argument, we need to #define it rather
-  // than use a standard declaration, so that SemaChecking can range check
-  // the immediate passed by the user.
-  if (proto.find('i') != std::string::npos)
-    return true;
-
-  // Pointer arguments need to use macros to avoid hiding aligned attributes
-  // from the pointer type.
-  if (proto.find('p') != std::string::npos ||
-      proto.find('c') != std::string::npos)
-    return true;
-
-  // It is not permitted to pass or return an __fp16 by value, so intrinsics
-  // taking a scalar float16_t must be implemented as macros.
-  if (typestr.find('h') != std::string::npos &&
-      proto.find('s') != std::string::npos)
-    return true;
-
-  return false;
+  OS << ")";
 }
 
-/// MacroArgUsedDirectly - Return true if argument i for an intrinsic that is
-/// defined as a macro should be accessed directly instead of being first
-/// assigned to a local temporary.
-static bool MacroArgUsedDirectly(const std::string &proto, unsigned i) {
-  // True for constant ints (i), pointers (p) and const pointers (c).
-  return (proto[i] == 'i' || proto[i] == 'p' || proto[i] == 'c');
+void Intrinsic::emitOpeningBrace() {
+  if (UseMacro)
+    OS << " __extension__ ({";
+  else
+    OS << " {";
+  emitNewLine();
 }
 
-// Generate the string "(argtype a, argtype b, ...)"
-static std::string GenArgs(const std::string &proto, StringRef typestr,
-                           const std::string &name) {
-  bool define = UseMacro(proto, typestr);
-  char arg = 'a';
-
-  std::string s;
-  s += "(";
-
-  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
-    if (define) {
-      // Some macro arguments are used directly instead of being assigned
-      // to local temporaries; prepend an underscore prefix to make their
-      // names consistent with the local temporaries.
-      if (MacroArgUsedDirectly(proto, i))
-        s += "__";
-    } else {
-      s += TypeString(proto[i], typestr) + " __";
-    }
-    s.push_back(arg);
-    if ((i + 1) < e)
-      s += ", ";
-  }
+void Intrinsic::emitClosingBrace() {
+  if (UseMacro)
+    OS << "})";
+  else
+    OS << "}";
+}
 
-  s += ")";
-  return s;
+void Intrinsic::emitNewLine() {
+  if (UseMacro)
+    OS << " \\\n";
+  else
+    OS << "\n";
 }
 
-// Macro arguments are not type-checked like inline function arguments, so
-// assign them to local temporaries to get the right type checking.
-static std::string GenMacroLocals(const std::string &proto, StringRef typestr,
-                                  const std::string &name ) {
-  char arg = 'a';
-  std::string s;
-  bool generatedLocal = false;
+void Intrinsic::emitShadowedArgs() {
+  // Macro arguments are not type-checked like inline function arguments,
+  // so assign them to local temporaries to get the right type checking.
+  if (!UseMacro)
+    return;
 
-  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
+  for (unsigned I = 0; I < getNumParams(); ++I) {
     // Do not create a temporary for an immediate argument.
     // That would defeat the whole point of using a macro!
-    if (MacroArgUsedDirectly(proto, i))
+    if (hasImmediate() && Proto[I+1] == 'i')
+      continue;
+    // Do not create a temporary for pointer arguments. The input
+    // pointer may have an alignment hint.
+    if (getParamType(I).isPointer())
       continue;
-    generatedLocal = true;
-    s += TypeString(proto[i], typestr) + " __";
-    s.push_back(arg);
-    s += " = (";
-    s.push_back(arg);
-    s += "); ";
-  }
-
-  if (generatedLocal)
-    s += "\\\n  ";
-  return s;
-}
-
-// Use the vmovl builtin to sign-extend or zero-extend a vector.
-static std::string Extend(StringRef typestr, const std::string &a, bool h=0) {
-  std::string s, high;
-  high = h ? "_high" : "";
-  s = MangleName("vmovl" + high, typestr, ClassS);
-  s += "(" + a + ")";
-  return s;
-}
-
-// Get the high 64-bit part of a vector
-static std::string GetHigh(const std::string &a, StringRef typestr) {
-  std::string s;
-  s = MangleName("vget_high", typestr, ClassS);
-  s += "(" + a + ")";
-  return s;
-}
 
-// Gen operation with two operands and get high 64-bit for both of two operands.
-static std::string Gen2OpWith2High(StringRef typestr,
-                                   const std::string &op,
-                                   const std::string &a,
-                                   const std::string &b) {
-  std::string s;
-  std::string Op1 = GetHigh(a, typestr);
-  std::string Op2 = GetHigh(b, typestr);
-  s = MangleName(op, typestr, ClassS);
-  s += "(" + Op1 + ", " + Op2 + ");";
-  return s;
-}
+    char NameC = '0' + I;
+    std::string Name = "p";
+    Name.push_back(NameC);
 
-// Gen operation with three operands and get high 64-bit of the latter 
-// two operands.
-static std::string Gen3OpWith2High(StringRef typestr,
-                                   const std::string &op,
-                                   const std::string &a,
-                                   const std::string &b,
-                                   const std::string &c) {
-  std::string s;
-  std::string Op1 = GetHigh(b, typestr);
-  std::string Op2 = GetHigh(c, typestr);
-  s = MangleName(op, typestr, ClassS);
-  s += "(" + a + ", " + Op1 + ", " + Op2 + ");";
-  return s;
-}
+    assert(Variables.find(Name) != Variables.end());
+    Variable &V = Variables[Name];
 
-// Gen combine operation by putting a on low 64-bit, and b on high 64-bit.
-static std::string GenCombine(std::string typestr,
-                              const std::string &a,
-                              const std::string &b) {
-  std::string s;
-  s = MangleName("vcombine", typestr, ClassS);
-  s += "(" + a + ", " + b + ")";
-  return s;
-}
+    std::string NewName = "s" + utostr(I);
+    Variable V2(V.getType(), NewName + VariablePostfix);
 
-static std::string Duplicate(unsigned nElts, StringRef typestr,
-                             const std::string &a) {
-  std::string s;
+    OS << "  " << V2.getType().str() << " " << V2.getName() << " = "
+       << V.getName() << ";";
+    emitNewLine();
 
-  s = "(" + TypeString('d', typestr) + "){ ";
-  for (unsigned i = 0; i != nElts; ++i) {
-    s += a;
-    if ((i + 1) < nElts)
-      s += ", ";
+    V = V2;
   }
-  s += " }";
-
-  return s;
 }
 
-static std::string SplatLane(unsigned nElts, const std::string &vec,
-                             const std::string &lane) {
-  std::string s = "__builtin_shufflevector(" + vec + ", " + vec;
-  for (unsigned i = 0; i < nElts; ++i)
-    s += ", " + lane;
-  s += ")";
-  return s;
+// We don't check 'a' in this function, because for builtin functions the
+// argument matching 'a' uses a vector type splatted from a scalar type.
+bool Intrinsic::protoHasScalar() {
+  return (Proto.find('s') != std::string::npos ||
+          Proto.find('z') != std::string::npos ||
+          Proto.find('r') != std::string::npos ||
+          Proto.find('b') != std::string::npos ||
+          Proto.find('$') != std::string::npos ||
+          Proto.find('y') != std::string::npos ||
+          Proto.find('o') != std::string::npos);
 }
 
-static std::string RemoveHigh(const std::string &name) {
-  std::string s = name;
-  std::size_t found = s.find("_high_");
-  if (found == std::string::npos)
-    PrintFatalError("name should contain \"_high_\" for high intrinsics");
-  s.replace(found, 5, "");
-  return s;
-}
+void Intrinsic::emitBodyAsBuiltinCall() {
+  std::string S;
 
-static unsigned GetNumElements(StringRef typestr, bool &quad) {
-  quad = false;
-  bool dummy = false;
-  char type = ClassifyType(typestr, quad, dummy, dummy);
-  unsigned nElts = 0;
-  switch (type) {
-  case 'c': nElts = 8; break;
-  case 's': nElts = 4; break;
-  case 'i': nElts = 2; break;
-  case 'l': nElts = 1; break;
-  case 'k': nElts = 1; break;
-  case 'h': nElts = 4; break;
-  case 'f': nElts = 2; break;
-  case 'd':
-    nElts = 1;
-    break;
-  default:
-    PrintFatalError("unhandled type!");
-  }
-  if (quad) nElts <<= 1;
-  return nElts;
-}
+  // If this builtin returns a struct of 2, 3, or 4 vectors, pass it as an
+  // implicit sret-like argument.
+  bool SRet = getReturnType().getNumVectors() >= 2;
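+  //
+  // When SRet is true the generated call passes the return variable by
+  // address; a rough, illustrative example (assumed names) is:
+  //   int8x8x2_t __ret;
+  //   __builtin_neon_vld2_v(&__ret, __p0, 0);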
 
-// Generate the definition for this intrinsic, e.g. "a + b" for OpAdd.
-//
-// Note that some intrinsic definitions around 'lane' are being implemented
-// with macros, because they all contain constant integer argument, and we
-// statically check the range of the lane index to meet the semantic
-// requirement of different intrinsics.
-//
-// For the intrinsics implemented with macro, if they contain another intrinsic
-// implemented with maco, we have to avoid using the same argument names for
-// the nested instrinsics. For example, macro vfms_lane is being implemented
-// with another macor vfma_lane, so we rename all arguments for vfms_lane by
-// adding a suffix '1'.
-
-static std::string GenOpString(const std::string &name, OpKind op,
-                               const std::string &proto, StringRef typestr) {
-  bool quad;
-  unsigned nElts = GetNumElements(typestr, quad);
-  bool define = UseMacro(proto, typestr);
-
-  std::string ts = TypeString(proto[0], typestr);
-  std::string s;
-  if (!define) {
-    s = "return ";
-  }
-
-  switch(op) {
-  case OpAdd:
-    s += "__a + __b;";
-    break;
-  case OpAddl:
-    s += Extend(typestr, "__a") + " + " + Extend(typestr, "__b") + ";";
-    break;
-  case OpAddlHi:
-    s += Extend(typestr, "__a", 1) + " + " + Extend(typestr, "__b", 1) + ";";
-    break;
-  case OpAddw:
-    s += "__a + " + Extend(typestr, "__b") + ";";
-    break;
-  case OpAddwHi:
-    s += "__a + " + Extend(typestr, "__b", 1) + ";";
-    break;
-  case OpSub:
-    s += "__a - __b;";
-    break;
-  case OpSubl:
-    s += Extend(typestr, "__a") + " - " + Extend(typestr, "__b") + ";";
-    break;
-  case OpSublHi:
-    s += Extend(typestr, "__a", 1) + " - " + Extend(typestr, "__b", 1) + ";";
-    break;
-  case OpSubw:
-    s += "__a - " + Extend(typestr, "__b") + ";";
-    break;
-  case OpSubwHi:
-    s += "__a - " + Extend(typestr, "__b", 1) + ";";
-    break;
-  case OpMulN:
-    s += "__a * " + Duplicate(nElts, typestr, "__b") + ";";
-    break;
-  case OpMulLane:
-    s += "__a * " + SplatLane(nElts, "__b", "__c") + ";";
-    break;
-  case OpMulXLane:
-    s += MangleName("vmulx", typestr, ClassS) + "(__a, " +
-      SplatLane(nElts, "__b", "__c") + ");";
-    break;
-  case OpMul:
-    s += "__a * __b;";
-    break;
-  case OpFMlaN:
-    s += MangleName("vfma", typestr, ClassS);
-    s += "(__a, __b, " + Duplicate(nElts,typestr, "__c") + ");";
-    break;
-  case OpFMlsN:
-    s += MangleName("vfms", typestr, ClassS);
-    s += "(__a, __b, " + Duplicate(nElts,typestr, "__c") + ");";
-    break;
-  case OpMullLane:
-    s += MangleName("vmull", typestr, ClassS) + "(__a, " +
-      SplatLane(nElts, "__b", "__c") + ");";
-    break;
-  case OpMullHiLane:
-    s += MangleName("vmull", typestr, ClassS) + "(" +
-      GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
-    break;
-  case OpMlaN:
-    s += "__a + (__b * " + Duplicate(nElts, typestr, "__c") + ");";
-    break;
-  case OpMlaLane:
-    s += "__a + (__b * " + SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpMla:
-    s += "__a + (__b * __c);";
-    break;
-  case OpMlalN:
-    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " +
-      Duplicate(nElts, typestr, "__c") + ");";
-    break;
-  case OpMlalLane:
-    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " +
-      SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpMlalHiLane:
-    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" +
-      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpMlal:
-    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
-    break;
-  case OpMullHi:
-    s += Gen2OpWith2High(typestr, "vmull", "__a", "__b");
-    break;
-  case OpMullHiP64: {
-    std::string Op1 = GetHigh("__a", typestr);
-    std::string Op2 = GetHigh("__b", typestr);
-    s += MangleName("vmull", typestr, ClassS);
-    s += "((poly64_t)" + Op1 + ", (poly64_t)" + Op2 + ");";
-    break;
-  }
-  case OpMullHiN:
-    s += MangleName("vmull_n", typestr, ClassS);
-    s += "(" + GetHigh("__a", typestr) + ", __b);";
-    return s;
-  case OpMlalHi:
-    s += Gen3OpWith2High(typestr, "vmlal", "__a", "__b", "__c");
-    break;
-  case OpMlalHiN:
-    s += MangleName("vmlal_n", typestr, ClassS);
-    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
-    return s;
-  case OpMlsN:
-    s += "__a - (__b * " + Duplicate(nElts, typestr, "__c") + ");";
-    break;
-  case OpMlsLane:
-    s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpFMSLane:
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
-    s += MangleName("vfma_lane", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
-    break;
-  case OpFMSLaneQ:
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
-    s += MangleName("vfma_laneq", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
-    break;
-  case OpMls:
-    s += "__a - (__b * __c);";
-    break;
-  case OpMlslN:
-    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " +
-      Duplicate(nElts, typestr, "__c") + ");";
-    break;
-  case OpMlslLane:
-    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " +
-      SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpMlslHiLane:
-    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(" +
-      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpMlsl:
-    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
-    break;
-  case OpMlslHi:
-    s += Gen3OpWith2High(typestr, "vmlsl", "__a", "__b", "__c");
-    break;
-  case OpMlslHiN:
-    s += MangleName("vmlsl_n", typestr, ClassS);
-    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
-    break;
-  case OpQDMullLane:
-    s += MangleName("vqdmull", typestr, ClassS) + "(__a, " +
-      SplatLane(nElts, "__b", "__c") + ");";
-    break;
-  case OpQDMullHiLane:
-    s += MangleName("vqdmull", typestr, ClassS) + "(" +
-      GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
-    break;
-  case OpQDMlalLane:
-    s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " +
-      SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpQDMlalHiLane:
-    s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " +
-      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpQDMlslLane:
-    s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " +
-      SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpQDMlslHiLane:
-    s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " +
-      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
-    break;
-  case OpQDMulhLane:
-    s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " +
-      SplatLane(nElts, "__b", "__c") + ");";
-    break;
-  case OpQRDMulhLane:
-    s += MangleName("vqrdmulh", typestr, ClassS) + "(__a, " +
-      SplatLane(nElts, "__b", "__c") + ");";
-    break;
-  case OpEq:
-    s += "(" + ts + ")(__a == __b);";
-    break;
-  case OpGe:
-    s += "(" + ts + ")(__a >= __b);";
-    break;
-  case OpLe:
-    s += "(" + ts + ")(__a <= __b);";
-    break;
-  case OpGt:
-    s += "(" + ts + ")(__a > __b);";
-    break;
-  case OpLt:
-    s += "(" + ts + ")(__a < __b);";
-    break;
-  case OpNeg:
-    s += " -__a;";
-    break;
-  case OpNot:
-    s += " ~__a;";
-    break;
-  case OpAnd:
-    s += "__a & __b;";
-    break;
-  case OpOr:
-    s += "__a | __b;";
-    break;
-  case OpXor:
-    s += "__a ^ __b;";
-    break;
-  case OpAndNot:
-    s += "__a & ~__b;";
-    break;
-  case OpOrNot:
-    s += "__a | ~__b;";
-    break;
-  case OpCast:
-    s += "(" + ts + ")__a;";
-    break;
-  case OpConcat:
-    s += "(" + ts + ")__builtin_shufflevector((int64x1_t)__a";
-    s += ", (int64x1_t)__b, 0, 1);";
-    break;
-  case OpHi:
-    // nElts is for the result vector, so the source is twice that number.
-    s += "__builtin_shufflevector(__a, __a";
-    for (unsigned i = nElts; i < nElts * 2; ++i)
-      s += ", " + utostr(i);
-    s+= ");";
-    break;
-  case OpLo:
-    s += "__builtin_shufflevector(__a, __a";
-    for (unsigned i = 0; i < nElts; ++i)
-      s += ", " + utostr(i);
-    s+= ");";
-    break;
-  case OpDup:
-    s += Duplicate(nElts, typestr, "__a") + ";";
-    break;
-  case OpDupLane:
-    s += SplatLane(nElts, "__a", "__b") + ";";
-    break;
-  case OpSelect:
-    // ((0 & 1) | (~0 & 2))
-    s += "(" + ts + ")";
-    ts = TypeString(proto[1], typestr);
-    s += "((__a & (" + ts + ")__b) | ";
-    s += "(~__a & (" + ts + ")__c));";
-    break;
-  case OpRev16:
-    s += "__builtin_shufflevector(__a, __a";
-    for (unsigned i = 2; i <= nElts; i += 2)
-      for (unsigned j = 0; j != 2; ++j)
-        s += ", " + utostr(i - j - 1);
-    s += ");";
-    break;
-  case OpRev32: {
-    unsigned WordElts = nElts >> (1 + (int)quad);
-    s += "__builtin_shufflevector(__a, __a";
-    for (unsigned i = WordElts; i <= nElts; i += WordElts)
-      for (unsigned j = 0; j != WordElts; ++j)
-        s += ", " + utostr(i - j - 1);
-    s += ");";
-    break;
-  }
-  case OpRev64: {
-    unsigned DblWordElts = nElts >> (int)quad;
-    s += "__builtin_shufflevector(__a, __a";
-    for (unsigned i = DblWordElts; i <= nElts; i += DblWordElts)
-      for (unsigned j = 0; j != DblWordElts; ++j)
-        s += ", " + utostr(i - j - 1);
-    s += ");";
-    break;
-  }
-  case OpXtnHi: {
-    s = TypeString(proto[1], typestr) + " __a1 = " +
-        MangleName("vmovn", typestr, ClassS) + "(__b);\n  " +
-        "return __builtin_shufflevector(__a, __a1";
-    for (unsigned i = 0; i < nElts * 4; ++i)
-      s += ", " + utostr(i);
-    s += ");";
-    break;
-  }
-  case OpSqxtunHi: {
-    s = TypeString(proto[1], typestr) + " __a1 = " +
-        MangleName("vqmovun", typestr, ClassS) + "(__b);\n  " +
-        "return __builtin_shufflevector(__a, __a1";
-    for (unsigned i = 0; i < nElts * 4; ++i)
-      s += ", " + utostr(i);
-    s += ");";
-    break;
-  }
-  case OpQxtnHi: {
-    s = TypeString(proto[1], typestr) + " __a1 = " +
-        MangleName("vqmovn", typestr, ClassS) + "(__b);\n  " +
-        "return __builtin_shufflevector(__a, __a1";
-    for (unsigned i = 0; i < nElts * 4; ++i)
-      s += ", " + utostr(i);
-    s += ");";
-    break;
-  }
-  case OpFcvtnHi: {
-    std::string FName = (nElts == 1) ? "vcvt_f32" : "vcvt_f16";
-    s = TypeString(proto[1], typestr) + " __a1 = " +
-        MangleName(FName, typestr, ClassS) + "(__b);\n  " +
-        "return __builtin_shufflevector(__a, __a1";
-    for (unsigned i = 0; i < nElts * 4; ++i)
-      s += ", " + utostr(i);
-    s += ");";
-    break;
-  }
-  case OpFcvtlHi: {
-    std::string FName = (nElts == 2) ? "vcvt_f64" : "vcvt_f32";
-    s = TypeString('d', typestr) + " __a1 = " + GetHigh("__a", typestr) +
-        ";\n  return " + MangleName(FName, typestr, ClassS) + "(__a1);";
-    break;
-  }
-  case OpFcvtxnHi: {
-    s = TypeString(proto[1], typestr) + " __a1 = " +
-        MangleName("vcvtx_f32", typestr, ClassS) + "(__b);\n  " +
-        "return __builtin_shufflevector(__a, __a1";
-    for (unsigned i = 0; i < nElts * 4; ++i)
-      s += ", " + utostr(i);
-    s += ");";
-    break;
-  }
-  case OpUzp1:
-    s += "__builtin_shufflevector(__a, __b";
-    for (unsigned i = 0; i < nElts; i++)
-      s += ", " + utostr(2*i);
-    s += ");";
-    break;
-  case OpUzp2:
-    s += "__builtin_shufflevector(__a, __b";
-    for (unsigned i = 0; i < nElts; i++)
-      s += ", " + utostr(2*i+1);
-    s += ");";
-    break;
-  case OpZip1:
-    s += "__builtin_shufflevector(__a, __b";
-    for (unsigned i = 0; i < (nElts/2); i++)
-       s += ", " + utostr(i) + ", " + utostr(i+nElts);
-    s += ");";
-    break;
-  case OpZip2:
-    s += "__builtin_shufflevector(__a, __b";
-    for (unsigned i = nElts/2; i < nElts; i++)
-       s += ", " + utostr(i) + ", " + utostr(i+nElts);
-    s += ");";
-    break;
-  case OpTrn1:
-    s += "__builtin_shufflevector(__a, __b";
-    for (unsigned i = 0; i < (nElts/2); i++)
-       s += ", " + utostr(2*i) + ", " + utostr(2*i+nElts);
-    s += ");";
-    break;
-  case OpTrn2:
-    s += "__builtin_shufflevector(__a, __b";
-    for (unsigned i = 0; i < (nElts/2); i++)
-       s += ", " + utostr(2*i+1) + ", " + utostr(2*i+1+nElts);
-    s += ");";
-    break;
-  case OpAbdl: {
-    std::string abd = MangleName("vabd", typestr, ClassS) + "(__a, __b)";
-    if (typestr[0] != 'U') {
-      // vabd results are always unsigned and must be zero-extended.
-      std::string utype = "U" + typestr.str();
-      s += "(" + TypeString(proto[0], typestr) + ")";
-      abd = "(" + TypeString('d', utype) + ")" + abd;
-      s += Extend(utype, abd) + ";";
-    } else {
-      s += Extend(typestr, abd) + ";";
-    }
-    break;
-  }
-  case OpAbdlHi:
-    s += Gen2OpWith2High(typestr, "vabdl", "__a", "__b");
-    break;
-  case OpAddhnHi: {
-    std::string addhn = MangleName("vaddhn", typestr, ClassS) + "(__b, __c)";
-    s += GenCombine(GetNarrowTypestr(typestr), "__a", addhn);
-    s += ";";
-    break;
-  }
-  case OpRAddhnHi: {
-    std::string raddhn = MangleName("vraddhn", typestr, ClassS) + "(__b, __c)";
-    s += GenCombine(GetNarrowTypestr(typestr), "__a", raddhn);
-    s += ";";
-    break;
-  }
-  case OpSubhnHi: {
-    std::string subhn = MangleName("vsubhn", typestr, ClassS) + "(__b, __c)";
-    s += GenCombine(GetNarrowTypestr(typestr), "__a", subhn);
-    s += ";";
-    break;
-  }
-  case OpRSubhnHi: {
-    std::string rsubhn = MangleName("vrsubhn", typestr, ClassS) + "(__b, __c)";
-    s += GenCombine(GetNarrowTypestr(typestr), "__a", rsubhn);
-    s += ";";
-    break;
-  }
-  case OpAba:
-    s += "__a + " + MangleName("vabd", typestr, ClassS) + "(__b, __c);";
-    break;
-  case OpAbal:
-    s += "__a + " + MangleName("vabdl", typestr, ClassS) + "(__b, __c);";
-    break;
-  case OpAbalHi:
-    s += Gen3OpWith2High(typestr, "vabal", "__a", "__b", "__c");
-    break;
-  case OpQDMullHi:
-    s += Gen2OpWith2High(typestr, "vqdmull", "__a", "__b");
-    break;
-  case OpQDMullHiN:
-    s += MangleName("vqdmull_n", typestr, ClassS);
-    s += "(" + GetHigh("__a", typestr) + ", __b);";
-    return s;
-  case OpQDMlalHi:
-    s += Gen3OpWith2High(typestr, "vqdmlal", "__a", "__b", "__c");
-    break;
-  case OpQDMlalHiN:
-    s += MangleName("vqdmlal_n", typestr, ClassS);
-    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
-    return s;
-  case OpQDMlslHi:
-    s += Gen3OpWith2High(typestr, "vqdmlsl", "__a", "__b", "__c");
-    break;
-  case OpQDMlslHiN:
-    s += MangleName("vqdmlsl_n", typestr, ClassS);
-    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
-    return s;
-  case OpDiv:
-    s += "__a / __b;";
-    break;
-  case OpMovlHi: {
-    s = TypeString(proto[1], typestr.drop_front()) + " __a1 = " +
-        MangleName("vget_high", typestr, ClassS) + "(__a);\n  " + s;
-    s += "(" + ts + ")" + MangleName("vshll_n", typestr, ClassS);
-    s += "(__a1, 0);";
-    break;
-  }
-  case OpLongHi: {
-    // Another local variable __a1 is needed for calling a Macro,
-    // or using __a will have naming conflict when Macro expanding.
-    s += TypeString(proto[1], typestr.drop_front()) + " __a1 = " +
-         MangleName("vget_high", typestr, ClassS) + "(__a); \\\n";
-    s += "  (" + ts + ")" + MangleName(RemoveHigh(name), typestr, ClassS) +
-         "(__a1, __b);";
-    break;
-  }
-  case OpNarrowHi: {
-    s += "(" + ts + ")" + MangleName("vcombine", typestr, ClassS) + "(__a, " +
-         MangleName(RemoveHigh(name), typestr, ClassS) + "(__b, __c));";
-    break;
-  }
-  case OpCopyLane: {
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
-    s += TypeString('s', typestr) + " __c2 = " +
-         MangleName("vget_lane", typestr, ClassS) + "(__c1, __d); \\\n  " +
-         MangleName("vset_lane", typestr, ClassS) + "(__c2, __a1, __b);";
-    break;
-  }
-  case OpCopyQLane: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
-    s += TypeString('s', typestr) + " __c2 = vget_lane_" + typeCode +
-         "(__c1, __d); \\\n  vsetq_lane_" + typeCode + "(__c2, __a1, __b);";
-    break;
-  }
-  case OpCopyLaneQ: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
-    s += TypeString('s', typestr) + " __c2 = vgetq_lane_" + typeCode +
-         "(__c1, __d); \\\n  vset_lane_" + typeCode + "(__c2, __a1, __b);";
-    break;
-  }
-  case OpScalarMulLane: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode +
-      "(__b, __c);\\\n  __a * __d1;";
-    break;
-  }
-  case OpScalarMulLaneQ: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += TypeString('s', typestr) + " __d1 = vgetq_lane_" + typeCode +
-      "(__b1, __c);\\\n  __a1 * __d1;";
-    break;
-  }
-  case OpScalarMulXLane: {
-    bool dummy = false;
-    char type = ClassifyType(typestr, dummy, dummy, dummy);
-    if (type == 'f') type = 's';
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode +
-      "(__b1, __c);\\\n  vmulx" + type + "_" +
-      typeCode +  "(__a1, __d1);";
-    break;
-  }
-  case OpScalarMulXLaneQ: {
-    bool dummy = false;
-    char type = ClassifyType(typestr, dummy, dummy, dummy);
-    if (type == 'f') type = 's';
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += TypeString('s', typestr) + " __d1 = vgetq_lane_" +
-      typeCode + "(__b1, __c);\\\n  vmulx" + type +
-      "_" + typeCode +  "(__a1, __d1);";
-    break;
+  StringRef N = Name;
+  if (hasSplat()) {
+    // Call the non-splat builtin: chop off the "_n" suffix from the name.
+    assert(N.endswith("_n"));
+    N = N.drop_back(2);
   }
 
-  case OpScalarVMulXLane: {
-    bool dummy = false;
-    char type = ClassifyType(typestr, dummy, dummy, dummy);
-    if (type == 'f') type = 's';
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += TypeString('s', typestr) + " __d1 = vget_lane_" +
-      typeCode + "(__a1, 0);\\\n" +
-      "  " + TypeString('s', typestr) + " __e1 = vget_lane_" +
-      typeCode + "(__b1, __c);\\\n" +
-      "  " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" +
-      typeCode + "(__d1, __e1);\\\n" +
-      "  " + TypeString('d', typestr) + " __g1;\\\n" +
-      "  vset_lane_" + typeCode + "(__f1, __g1, __c);";
-    break;
-  }
+  ClassKind LocalCK = CK;
+  if (!protoHasScalar())
+    LocalCK = ClassB;
 
-  case OpScalarVMulXLaneQ: {
-    bool dummy = false;
-    char type = ClassifyType(typestr, dummy, dummy, dummy);
-    if (type == 'f') type = 's';
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += TypeString('s', typestr) + " __d1 = vget_lane_" +
-      typeCode + "(__a1, 0);\\\n" +
-      "  " + TypeString('s', typestr) + " __e1 = vgetq_lane_" +
-      typeCode + "(__b1, __c);\\\n" +
-      "  " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" +
-      typeCode + "(__d1, __e1);\\\n" +
-      "  " + TypeString('d', typestr) + " __g1;\\\n" +
-      "  vset_lane_" + typeCode + "(__f1, __g1, 0);";
-    break;
-  }
-  case OpScalarQDMullLane: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += MangleName("vqdmull", typestr, ClassS) + "(__a1, " +
-    "vget_lane_" + typeCode + "(__b1, __c));";
-    break;
-  }
-  case OpScalarQDMullLaneQ: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += MangleName("vqdmull", typestr, ClassS) + "(__a1, " +
-    "vgetq_lane_" + typeCode + "(__b1, __c));";
-    break;
-  }
-  case OpScalarQDMulHiLane: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += MangleName("vqdmulh", typestr, ClassS) + "(__a1, " +
-    "vget_lane_" + typeCode + "(__b1, __c));";
-    break;
-  }
-  case OpScalarQDMulHiLaneQ: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += MangleName("vqdmulh", typestr, ClassS) + "(__a1, " +
-    "vgetq_lane_" + typeCode + "(__b1, __c));";
-    break;
-  }
-  case OpScalarQRDMulHiLane: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += MangleName("vqrdmulh", typestr, ClassS) + "(__a1, " +
-    "vget_lane_" + typeCode + "(__b1, __c));";
-    break;
-  }
-  case OpScalarQRDMulHiLaneQ: {
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
-    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
-    s += MangleName("vqrdmulh", typestr, ClassS) + "(__a1, " +
-    "vgetq_lane_" + typeCode + "(__b1, __c));";
-    break;
-  }
-  case OpScalarGetLane:{
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
+  if (!getReturnType().isVoid() && !SRet)
+    S += "(" + RetVar.getType().str() + ") ";
 
-    std::string intType = quad ? "int16x8_t" : "int16x4_t";
-    std::string intName = quad ? "vgetq" : "vget";
+  S += "__builtin_neon_" + mangleName(N, LocalCK) + "(";
 
-    // reinterpret float16 vector as int16 vector
-    s += intType + " __a2 = *(" + intType + " *)(&__a1);\\\n";
+  if (SRet)
+    S += "&" + RetVar.getName() + ", ";
 
-    s += "  int16_t __a3 = " + intName + "_lane_s16(__a2, __b);\\\n";
+  for (unsigned I = 0; I < getNumParams(); ++I) {
+    Variable &V = Variables["p" + utostr(I)];
+    Type T = V.getType();
 
-    // reinterpret int16 vector as float16 vector
-    s += "  float16_t __a4 = *(float16_t *)(&__a3);\\\n";
-    s += "  __a4;";
-    break;
-  }
-  case OpScalarSetLane:{
-    std::string typeCode = "";
-    InstructionTypeCode(typestr, ClassS, quad, typeCode);
-    s += TypeString(proto[1], typestr) + " __a1 = __a;\\\n  ";
+    // Handle multiple-vector values specially, emitting each subvector as an
+    // argument to the builtin.
+    if (T.getNumVectors() > 1) {
+      // Check if an explicit cast is needed.
+      std::string Cast;
+      if (T.isChar() || T.isPoly() || !T.isSigned()) {
+        Type T2 = T;
+        T2.makeOneVector();
+        T2.makeInteger(8, /*Signed=*/true);
+        Cast = "(" + T2.str() + ")";
+      }
 
-    std::string origType = quad ? "float16x8_t" : "float16x4_t";
-    std::string intType = quad ? "int16x8_t" : "int16x4_t";
-    std::string intName = quad ? "vsetq" : "vset";
+      for (unsigned J = 0; J < T.getNumVectors(); ++J)
+        S += Cast + V.getName() + ".val[" + utostr(J) + "], ";
+      continue;
+    }
 
-    // reinterpret float16_t as int16_t
-    s += "int16_t __a2 = *(int16_t *)(&__a1);\\\n";
-    // reinterpret float16 vector as int16 vector
-    s += "  " + intType + " __b2 = *(" + intType + " *)(&__b);\\\n";
+    std::string Arg;
+    Type CastToType = T;
+    if (hasSplat() && I == getSplatIdx()) {
+      Arg = "(" + BaseType.str() + ") {";
+      for (unsigned J = 0; J < BaseType.getNumElements(); ++J) {
+        if (J != 0)
+          Arg += ", ";
+        Arg += V.getName();
+      }
+      Arg += "}";
 
-    s += "  " + intType + " __b3 = " + intName + "_lane_s16(__a2, __b2, __c);\\\n";
+      CastToType = BaseType;
+    } else {
+      Arg = V.getName();
+    }
 
-    // reinterpret int16 vector as float16 vector
-    s += "  " + origType + " __b4 = *(" + origType + " *)(&__b3);\\\n";
-    s += "__b4;";
-    break;
+    // Check if an explicit cast is needed.
+    if (CastToType.isVector()) {
+      CastToType.makeInteger(8, true);
+      Arg = "(" + CastToType.str() + ")" + Arg;
+    }
+
+    S += Arg + ", ";
   }
 
-  default:
-    PrintFatalError("unknown OpKind!");
+  // Extra constant integer to hold type class enum for this function, e.g. s8
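+  // (Illustratively, a ClassB call ends up looking like
+  //  "__builtin_neon_vabs_v((int8x8_t)__p0, 0);", where the trailing constant
+  //  is the assumed NeonTypeFlags encoding for s8.)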
+  if (getClassKind(true) == ClassB) {
+    Type ThisTy = getReturnType();
+    if (Proto[0] == 'v' || Proto[0] == 'f' || Proto[0] == 'F')
+      ThisTy = getParamType(0);
+    if (ThisTy.isPointer())
+      ThisTy = getParamType(1);
+
+    S += utostr(ThisTy.getNeonEnum());
+  } else {
+    // Remove extraneous ", ".
+    S.pop_back();
+    S.pop_back();
   }
-  return s;
+  S += ");";
+
+  std::string RetExpr;
+  if (!SRet && !RetVar.getType().isVoid())
+    RetExpr = RetVar.getName() + " = ";
+
+  OS << "  " << RetExpr << S;
+  emitNewLine();
 }
 
-static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) {
-  unsigned mod = proto[0];
+void Intrinsic::emitBody() {
+  std::vector<std::string> Lines;
 
-  if (mod == 'v' || mod == 'f' || mod == 'F')
-    mod = proto[1];
+  assert(RetVar.getType() == Types[0]);
+  // Create a return variable, if we're not void.
+  if (!RetVar.getType().isVoid()) {
+    OS << "  " << RetVar.getType().str() << " " << RetVar.getName() << ";";
+    emitNewLine();
+  }
 
-  bool quad = false;
-  bool poly = false;
-  bool usgn = false;
-  bool scal = false;
-  bool cnst = false;
-  bool pntr = false;
+  if (!Body || Body->getValues().size() == 0) {
+    // Nothing specific to output - must output a builtin.
+    emitBodyAsBuiltinCall();
+    return;
+  }
 
-  // Base type to get the type string for.
-  char type = ClassifyType(typestr, quad, poly, usgn);
+  // We have a list of "things to output". The last should be returned.
+  for (auto *I : Body->getValues()) {
+    if (StringInit *SI = dyn_cast<StringInit>(I)) {
+      Lines.push_back(replaceParamsIn(SI->getAsString()));
+    } else if (DagInit *DI = dyn_cast<DagInit>(I)) {
+      Lines.push_back(emitDag(DI).second + ";");
+    }
+  }
 
-  // Based on the modifying character, change the type and width if necessary.
-  type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);
+  assert(Lines.size() && "Empty def?");
+  if (!RetVar.getType().isVoid())
+    Lines.back().insert(0, RetVar.getName() + " = ");
 
-  NeonTypeFlags::EltType ET;
-  switch (type) {
-    case 'c':
-      ET = poly ? NeonTypeFlags::Poly8 : NeonTypeFlags::Int8;
-      break;
-    case 's':
-      ET = poly ? NeonTypeFlags::Poly16 : NeonTypeFlags::Int16;
-      break;
-    case 'i':
-      ET = NeonTypeFlags::Int32;
-      break;
-    case 'l':
-      ET = poly ? NeonTypeFlags::Poly64 : NeonTypeFlags::Int64;
-      break;
-    case 'k':
-      ET = NeonTypeFlags::Poly128;
-      break;
-    case 'h':
-      ET = NeonTypeFlags::Float16;
-      break;
-    case 'f':
-      ET = NeonTypeFlags::Float32;
-      break;
-    case 'd':
-      ET = NeonTypeFlags::Float64;
-      break;
-    default:
-      PrintFatalError("unhandled type!");
+  for (auto &L : Lines) {
+    OS << "  " << L;
+    emitNewLine();
   }
-  NeonTypeFlags Flags(ET, usgn, quad && proto[1] != 'g');
-  return Flags.getFlags();
 }
 
-// We don't check 'a' in this function, because for builtin function the
-// argument matching to 'a' uses a vector type splatted from a scalar type.
-static bool ProtoHasScalar(const std::string proto)
-{
-  return (proto.find('s') != std::string::npos
-          || proto.find('z') != std::string::npos
-          || proto.find('r') != std::string::npos
-          || proto.find('b') != std::string::npos
-          || proto.find('$') != std::string::npos
-          || proto.find('y') != std::string::npos
-          || proto.find('o') != std::string::npos);
+void Intrinsic::emitReturn() {
+  if (RetVar.getType().isVoid())
+    return;
+  if (UseMacro)
+    OS << "  " << RetVar.getName() << ";";
+  else
+    OS << "  return " << RetVar.getName() << ";";
+  emitNewLine();
 }
 
-// Generate the definition for this intrinsic, e.g. __builtin_neon_cls(a)
-static std::string GenBuiltin(const std::string &name, const std::string &proto,
-                              StringRef typestr, ClassKind ck) {
-  std::string s;
-
-  // If this builtin returns a struct 2, 3, or 4 vectors, pass it as an implicit
-  // sret-like argument.
-  bool sret = IsMultiVecProto(proto[0]);
-
-  bool define = UseMacro(proto, typestr);
-
-  // Check if the prototype has a scalar operand with the type of the vector
-  // elements.  If not, bitcasting the args will take care of arg checking.
-  // The actual signedness etc. will be taken care of with special enums.
-  if (!ProtoHasScalar(proto))
-    ck = ClassB;
-
-  if (proto[0] != 'v') {
-    std::string ts = TypeString(proto[0], typestr);
-
-    if (define) {
-      if (sret)
-        s += ts + " r; ";
-      else
-        s += "(" + ts + ")";
-    } else if (sret) {
-      s += ts + " r; ";
-    } else {
-      s += "return (" + ts + ")";
-    }
-  }
-
-  bool splat = proto.find('a') != std::string::npos;
+std::pair<Type, std::string> Intrinsic::emitDag(DagInit *DI) {
+  // At this point we should only be seeing a def.
+  DefInit *DefI = cast<DefInit>(DI->getOperator());
+  std::string Op = DefI->getAsString();
+
+  if (Op == "cast" || Op == "bitcast")
+    return emitDagCast(DI, Op == "bitcast");
+  if (Op == "shuffle")
+    return emitDagShuffle(DI);
+  if (Op == "dup")
+    return emitDagDup(DI);
+  if (Op == "splat")
+    return emitDagSplat(DI);
+  if (Op == "save_temp")
+    return emitDagSaveTemp(DI);
+  if (Op == "op")
+    return emitDagOp(DI);
+  if (Op == "call")
+    return emitDagCall(DI);
+  if (Op == "name_replace")
+    return emitDagNameReplace(DI);
+  if (Op == "literal")
+    return emitDagLiteral(DI);
+  assert_with_loc(false, "Unknown operation!");
+  return std::make_pair(Type::getVoid(), "");
+}
 
-  s += "__builtin_neon_";
-  if (splat) {
-    // Call the non-splat builtin: chop off the "_n" suffix from the name.
-    std::string vname(name, 0, name.size()-2);
-    s += MangleName(vname, typestr, ck);
+std::pair<Type, std::string> Intrinsic::emitDagOp(DagInit *DI) {
+  std::string Op = cast<StringInit>(DI->getArg(0))->getAsUnquotedString();
+  if (DI->getNumArgs() == 2) {
+    // Unary op.
+    std::pair<Type, std::string> R =
+        emitDagArg(DI->getArg(1), DI->getArgName(1));
+    return std::make_pair(R.first, Op + R.second);
   } else {
-    s += MangleName(name, typestr, ck);
+    assert(DI->getNumArgs() == 3 && "Can only handle unary and binary ops!");
+    std::pair<Type, std::string> R1 =
+        emitDagArg(DI->getArg(1), DI->getArgName(1));
+    std::pair<Type, std::string> R2 =
+        emitDagArg(DI->getArg(2), DI->getArgName(2));
+    assert_with_loc(R1.first == R2.first, "Argument type mismatch!");
+    return std::make_pair(R1.first, R1.second + " " + Op + " " + R2.second);
   }
-  s += "(";
-
-  // Pass the address of the return variable as the first argument to sret-like
-  // builtins.
-  if (sret)
-    s += "&r, ";
+}
 
-  char arg = 'a';
-  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
-    std::string args = std::string(&arg, 1);
+std::pair<Type, std::string> Intrinsic::emitDagCall(DagInit *DI) {
+  std::vector<Type> Types;
+  std::vector<std::string> Values;
+  for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) {
+    std::pair<Type, std::string> R =
+        emitDagArg(DI->getArg(I + 1), DI->getArgName(I + 1));
+    Types.push_back(R.first);
+    Values.push_back(R.second);
+  }
 
-    // Use the local temporaries instead of the macro arguments.
-    args = "__" + args;
+  // Look up the called intrinsic.
+  std::string N;
+  if (StringInit *SI = dyn_cast<StringInit>(DI->getArg(0)))
+    N = SI->getAsUnquotedString();
+  else
+    N = emitDagArg(DI->getArg(0), "").second;
+  Intrinsic *Callee = Emitter.getIntrinsic(N, Types);
+  assert(Callee && "getIntrinsic should not return us nullptr!");
 
-    bool argQuad = false;
-    bool argPoly = false;
-    bool argUsgn = false;
-    bool argScalar = false;
-    bool dummy = false;
-    char argType = ClassifyType(typestr, argQuad, argPoly, argUsgn);
-    argType = ModType(proto[i], argType, argQuad, argPoly, argUsgn, argScalar,
-                      dummy, dummy);
+  // Make sure the callee is known as an early def.
+  Callee->setNeededEarly();
+  Dependencies.insert(Callee);
 
-    // Handle multiple-vector values specially, emitting each subvector as an
-    // argument to the __builtin.
-    unsigned NumOfVec = 0;
-    if (proto[i] >= '2' && proto[i] <= '4') {
-      NumOfVec = proto[i] - '0';
-    } else if (proto[i] >= 'B' && proto[i] <= 'D') {
-      NumOfVec = proto[i] - 'A' + 1;
-    }
+  // Now create the call itself.
+  std::string S = Callee->getMangledName(true) + "(";
+  for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) {
+    if (I != 0)
+      S += ", ";
+    S += Values[I];
+  }
+  S += ")";
 
-    if (NumOfVec > 0) {
-      // Check if an explicit cast is needed.
-      if (argType != 'c' || argPoly || argUsgn)
-        args = (argQuad ? "(int8x16_t)" : "(int8x8_t)") + args;
+  return std::make_pair(Callee->getReturnType(), S);
+}
 
-      for (unsigned vi = 0, ve = NumOfVec; vi != ve; ++vi) {
-        s += args + ".val[" + utostr(vi) + "]";
-        if ((vi + 1) < ve)
-          s += ", ";
+std::pair<Type, std::string> Intrinsic::emitDagCast(DagInit *DI,
+                                                    bool IsBitCast) {
+  // (cast MOD* VAL) -> cast VAL to type given by MOD.
+  std::pair<Type, std::string> R = emitDagArg(
+      DI->getArg(DI->getNumArgs() - 1), DI->getArgName(DI->getNumArgs() - 1));
+  Type castToType = R.first;
+  for (unsigned ArgIdx = 0; ArgIdx < DI->getNumArgs() - 1; ++ArgIdx) {
+
+    // MOD can take several forms:
+    //   1. $X - take the type of parameter / variable X.
+    //   2. The value "R" - take the type of the return type.
+    //   3. a type string
+    //   4. The value "U" or "S" to switch the signedness.
+    //   5. The value "H" or "D" to half or double the bitwidth.
+    //   6. The value "8" to convert to 8-bit (signed) integer lanes.
+    if (DI->getArgName(ArgIdx).size()) {
+      assert_with_loc(Variables.find(DI->getArgName(ArgIdx)) != Variables.end(),
+                      "Variable not found");
+      castToType = Variables[DI->getArgName(ArgIdx)].getType();
+    } else {
+      StringInit *SI = dyn_cast<StringInit>(DI->getArg(ArgIdx));
+      assert_with_loc(SI, "Expected string type or $Name for cast type");
+
+      if (SI->getAsUnquotedString() == "R") {
+        castToType = getReturnType();
+      } else if (SI->getAsUnquotedString() == "U") {
+        castToType.makeUnsigned();
+      } else if (SI->getAsUnquotedString() == "S") {
+        castToType.makeSigned();
+      } else if (SI->getAsUnquotedString() == "H") {
+        castToType.halveLanes();
+      } else if (SI->getAsUnquotedString() == "D") {
+        castToType.doubleLanes();
+      } else if (SI->getAsUnquotedString() == "8") {
+        castToType.makeInteger(8, true);
+      } else {
+        castToType = Type::fromTypedefName(SI->getAsUnquotedString());
+        assert_with_loc(!castToType.isVoid(), "Unknown typedef");
       }
-      if ((i + 1) < e)
-        s += ", ";
-
-      continue;
-    }
-
-    if (splat && (i + 1) == e)
-      args = Duplicate(GetNumElements(typestr, argQuad), typestr, args);
-
-    // Check if an explicit cast is needed.
-    if ((splat || !argScalar) &&
-        ((ck == ClassB && argType != 'c') || argPoly || argUsgn)) {
-      std::string argTypeStr = "c";
-      if (ck != ClassB)
-        argTypeStr = argType;
-      if (argQuad)
-        argTypeStr = "Q" + argTypeStr;
-      args = "(" + TypeString('d', argTypeStr) + ")" + args;
     }
-
-    s += args;
-    if ((i + 1) < e)
-      s += ", ";
   }
 
-  // Extra constant integer to hold type class enum for this function, e.g. s8
-  if (ck == ClassB)
-    s += ", " + utostr(GetNeonEnum(proto, typestr));
+  std::string S;
+  if (IsBitCast) {
+    // Emit a reinterpret cast. The value being cast must be an lvalue (we
+    // take its address), so copy it into a temporary first.
+    std::string N = "reint";
+    unsigned I = 0;
+    while (Variables.find(N) != Variables.end())
+      N = "reint" + utostr(++I);
+    Variables[N] = Variable(R.first, N + VariablePostfix);
 
-  s += ");";
+    OS << R.first.str() << " " << Variables[N].getName() << " = " << R.second
+       << ";";
+    emitNewLine();
 
-  if (proto[0] != 'v' && sret) {
-    if (define)
-      s += " r;";
-    else
-      s += " return r;";
+    S = "*(" + castToType.str() + " *) &" + Variables[N].getName() + "";
+  } else {
+    // Emit a normal (static) cast.
+    S = "(" + castToType.str() + ")(" + R.second + ")";
   }
-  return s;
+
+  return std::make_pair(castToType, S);
 }
 
-static std::string GenBuiltinDef(const std::string &name,
-                                 const std::string &proto,
-                                 StringRef typestr, ClassKind ck) {
-  std::string s("BUILTIN(__builtin_neon_");
+std::pair<Type, std::string> Intrinsic::emitDagShuffle(DagInit *DI) {
+  // See the documentation in arm_neon.td for a description of these operators.
+  class LowHalf : public SetTheory::Operator {
+  public:
+    virtual void anchor() {}
+    virtual ~LowHalf() {}
+    virtual void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
+                       ArrayRef<SMLoc> Loc) {
+      SetTheory::RecSet Elts2;
+      ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts2, Loc);
+      Elts.insert(Elts2.begin(), Elts2.begin() + (Elts2.size() / 2));
+    }
+  };
+  class HighHalf : public SetTheory::Operator {
+  public:
+    virtual void anchor() {}
+    virtual ~HighHalf() {}
+    virtual void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
+                       ArrayRef<SMLoc> Loc) {
+      SetTheory::RecSet Elts2;
+      ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts2, Loc);
+      Elts.insert(Elts2.begin() + (Elts2.size() / 2), Elts2.end());
+    }
+  };
+  class Rev : public SetTheory::Operator {
+    unsigned ElementSize;
+
+  public:
+    Rev(unsigned ElementSize) : ElementSize(ElementSize) {}
+    virtual void anchor() {}
+    virtual ~Rev() {}
+    virtual void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
+                       ArrayRef<SMLoc> Loc) {
+      SetTheory::RecSet Elts2;
+      ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Elts2, Loc);
+
+      int64_t VectorSize = cast<IntInit>(Expr->getArg(0))->getValue();
+      VectorSize /= ElementSize;
+
+      std::vector<Record *> Revved;
+      for (unsigned VI = 0; VI < Elts2.size(); VI += VectorSize) {
+        for (int LI = VectorSize - 1; LI >= 0; --LI) {
+          Revved.push_back(Elts2[VI + LI]);
+        }
+      }
 
-  // If all types are the same size, bitcasting the args will take care
-  // of arg checking.  The actual signedness etc. will be taken care of with
-  // special enums.
-  if (!ProtoHasScalar(proto))
-    ck = ClassB;
+      Elts.insert(Revved.begin(), Revved.end());
+    }
+  };
+  class MaskExpander : public SetTheory::Expander {
+    unsigned N;
+
+  public:
+    MaskExpander(unsigned N) : N(N) {}
+    virtual void anchor() {}
+    virtual ~MaskExpander() {}
+    virtual void expand(SetTheory &ST, Record *R, SetTheory::RecSet &Elts) {
+      unsigned Addend = 0;
+      if (R->getName() == "mask0")
+        Addend = 0;
+      else if (R->getName() == "mask1")
+        Addend = N;
+      else
+        return;
+      for (unsigned I = 0; I < N; ++I)
+        Elts.insert(R->getRecords().getDef("sv" + utostr(I + Addend)));
+    }
+  };
 
-  s += MangleName(name, typestr, ck);
-  s += ", \"";
+  // (shuffle arg1, arg2, sequence)
+  std::pair<Type, std::string> Arg1 =
+      emitDagArg(DI->getArg(0), DI->getArgName(0));
+  std::pair<Type, std::string> Arg2 =
+      emitDagArg(DI->getArg(1), DI->getArgName(1));
+  assert_with_loc(Arg1.first == Arg2.first,
+                  "Different types in arguments to shuffle!");
+
+  SetTheory ST;
+  LowHalf LH;
+  HighHalf HH;
+  MaskExpander ME(Arg1.first.getNumElements());
+  Rev R(Arg1.first.getElementSizeInBits());
+  SetTheory::RecSet Elts;
+  ST.addOperator("lowhalf", &LH);
+  ST.addOperator("highhalf", &HH);
+  ST.addOperator("rev", &R);
+  ST.addExpander("MaskExpand", &ME);
+  ST.evaluate(DI->getArg(2), Elts, ArrayRef<SMLoc>());
+
+  std::string S = "__builtin_shufflevector(" + Arg1.second + ", " + Arg2.second;
+  for (auto &E : Elts) {
+    StringRef Name = E->getName();
+    assert_with_loc(Name.startswith("sv"),
+                    "Incorrect element kind in shuffle mask!");
+    S += ", " + Name.drop_front(2).str();
+  }
+  S += ")";
+
+  // Recalculate the return type - the shuffle may have halved or doubled it.
+  Type T(Arg1.first);
+  if (Elts.size() > T.getNumElements()) {
+    assert_with_loc(
+        Elts.size() == T.getNumElements() * 2,
+        "Can only double or half the number of elements in a shuffle!");
+    T.doubleLanes();
+  } else if (Elts.size() < T.getNumElements()) {
+    assert_with_loc(
+        Elts.size() == T.getNumElements() / 2,
+        "Can only double or half the number of elements in a shuffle!");
+    T.halveLanes();
+  }
+
+  return std::make_pair(T, S);
+}
 
-  for (unsigned i = 0, e = proto.size(); i != e; ++i)
-    s += BuiltinTypeString(proto[i], typestr, ck, i == 0);
+std::pair<Type, std::string> Intrinsic::emitDagDup(DagInit *DI) {
+  assert_with_loc(DI->getNumArgs() == 1, "dup() expects one argument");
+  std::pair<Type, std::string> A = emitDagArg(DI->getArg(0), DI->getArgName(0));
+  assert_with_loc(A.first.isScalar(), "dup() expects a scalar argument");
 
-  // Extra constant integer to hold type class enum for this function, e.g. s8
-  if (ck == ClassB)
-    s += "i";
+  Type T = getBaseType();
+  assert_with_loc(T.isVector(), "dup() used but default type is scalar!");
+  std::string S = "(" + T.str() + ") {";
+  for (unsigned I = 0; I < T.getNumElements(); ++I) {
+    if (I != 0)
+      S += ", ";
+    S += A.second;
+  }
+  S += "}";
 
-  s += "\", \"n\")";
-  return s;
+  return std::make_pair(T, S);
 }
 
-static std::string GenIntrinsic(const std::string &name,
-                                const std::string &proto,
-                                StringRef outTypeStr, StringRef inTypeStr,
-                                OpKind kind, ClassKind classKind) {
-  assert(!proto.empty() && "");
-  bool define = UseMacro(proto, outTypeStr) && kind != OpUnavailable;
-  std::string s;
-
-  // static always inline + return type
-  if (define)
-    s += "#define ";
-  else
-    s += "__ai " + TypeString(proto[0], outTypeStr) + " ";
-
-  // Function name with type suffix
-  std::string mangledName = MangleName(name, outTypeStr, ClassS);
-  if (outTypeStr != inTypeStr) {
-    // If the input type is different (e.g., for vreinterpret), append a suffix
-    // for the input type.  String off a "Q" (quad) prefix so that MangleName
-    // does not insert another "q" in the name.
-    unsigned typeStrOff = (inTypeStr[0] == 'Q' ? 1 : 0);
-    StringRef inTypeNoQuad = inTypeStr.substr(typeStrOff);
-    mangledName = MangleName(mangledName, inTypeNoQuad, ClassS);
-  }
-  s += mangledName;
-
-  // Function arguments
-  s += GenArgs(proto, inTypeStr, name);
-
-  // Definition.
-  if (define) {
-    s += " __extension__ ({ \\\n  ";
-    s += GenMacroLocals(proto, inTypeStr, name);
-  } else if (kind == OpUnavailable) {
-    s += " __attribute__((unavailable));\n";
-    return s;
-  } else
-    s += " {\n  ";
-
-  if (kind != OpNone)
-    s += GenOpString(name, kind, proto, outTypeStr);
-  else
-    s += GenBuiltin(name, proto, outTypeStr, classKind);
-  if (define)
-    s += " })";
-  else
-    s += " }";
-  s += "\n";
-  return s;
+std::pair<Type, std::string> Intrinsic::emitDagSplat(DagInit *DI) {
+  assert_with_loc(DI->getNumArgs() == 2, "splat() expects two arguments");
+  std::pair<Type, std::string> A = emitDagArg(DI->getArg(0), DI->getArgName(0));
+  std::pair<Type, std::string> B = emitDagArg(DI->getArg(1), DI->getArgName(1));
+
+  assert_with_loc(B.first.isScalar(),
+                  "splat() requires a scalar int as the second argument");
+
+  std::string S = "__builtin_shufflevector(" + A.second + ", " + A.second;
+  for (unsigned I = 0; I < BaseType.getNumElements(); ++I) {
+    S += ", " + B.second;
+  }
+  S += ")";
+
+  return std::make_pair(BaseType, S);
 }
 
-/// run - Read the records in arm_neon.td and output arm_neon.h.  arm_neon.h
-/// is comprised of type definitions and function declarations.
-void NeonEmitter::run(raw_ostream &OS) {
-  OS <<
-    "/*===---- arm_neon.h - ARM Neon intrinsics ------------------------------"
-    "---===\n"
-    " *\n"
-    " * Permission is hereby granted, free of charge, to any person obtaining "
-    "a copy\n"
-    " * of this software and associated documentation files (the \"Software\"),"
-    " to deal\n"
-    " * in the Software without restriction, including without limitation the "
-    "rights\n"
-    " * to use, copy, modify, merge, publish, distribute, sublicense, "
-    "and/or sell\n"
-    " * copies of the Software, and to permit persons to whom the Software is\n"
-    " * furnished to do so, subject to the following conditions:\n"
-    " *\n"
-    " * The above copyright notice and this permission notice shall be "
-    "included in\n"
-    " * all copies or substantial portions of the Software.\n"
-    " *\n"
-    " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, "
-    "EXPRESS OR\n"
-    " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF "
-    "MERCHANTABILITY,\n"
-    " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT "
-    "SHALL THE\n"
-    " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR "
-    "OTHER\n"
-    " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, "
-    "ARISING FROM,\n"
-    " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER "
-    "DEALINGS IN\n"
-    " * THE SOFTWARE.\n"
-    " *\n"
-    " *===--------------------------------------------------------------------"
-    "---===\n"
-    " */\n\n";
+std::pair<Type, std::string> Intrinsic::emitDagSaveTemp(DagInit *DI) {
+  assert_with_loc(DI->getNumArgs() == 2, "save_temp() expects two arguments");
+  std::pair<Type, std::string> A = emitDagArg(DI->getArg(1), DI->getArgName(1));
 
-  OS << "#ifndef __ARM_NEON_H\n";
-  OS << "#define __ARM_NEON_H\n\n";
+  assert_with_loc(!A.first.isVoid(),
+                  "Argument to save_temp() must have non-void type!");
 
-  OS << "#if !defined(__ARM_NEON)\n";
-  OS << "#error \"NEON support not enabled\"\n";
-  OS << "#endif\n\n";
+  std::string N = DI->getArgName(0);
+  assert_with_loc(N.size(), "save_temp() expects a name as the first argument");
 
-  OS << "#include <stdint.h>\n\n";
+  assert_with_loc(Variables.find(N) == Variables.end(),
+                  "Variable already defined!");
+  Variables[N] = Variable(A.first, N + VariablePostfix);
 
-  // Emit NEON-specific scalar typedefs.
-  OS << "typedef float float32_t;\n";
-  OS << "typedef __fp16 float16_t;\n";
+  std::string S =
+      A.first.str() + " " + Variables[N].getName() + " = " + A.second;
 
-  OS << "#ifdef __aarch64__\n";
-  OS << "typedef double float64_t;\n";
-  OS << "#endif\n\n";
+  return std::make_pair(Type::getVoid(), S);
+}
 
-  // For now, signedness of polynomial types depends on target
-  OS << "#ifdef __aarch64__\n";
-  OS << "typedef uint8_t poly8_t;\n";
-  OS << "typedef uint16_t poly16_t;\n";
-  OS << "typedef uint64_t poly64_t;\n";
-  OS << "typedef __uint128_t poly128_t;\n";
-  OS << "#else\n";
-  OS << "typedef int8_t poly8_t;\n";
-  OS << "typedef int16_t poly16_t;\n";
-  OS << "#endif\n";
+std::pair<Type, std::string> Intrinsic::emitDagNameReplace(DagInit *DI) {
+  std::string S = Name;
 
-  // Emit Neon vector typedefs.
-  std::string TypedefTypes(
-      "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPsPlQPl");
-  SmallVector<StringRef, 24> TDTypeVec;
-  ParseTypes(nullptr, TypedefTypes, TDTypeVec);
+  assert_with_loc(DI->getNumArgs() == 2, "name_replace requires 2 arguments!");
+  std::string ToReplace =
+      cast<StringInit>(DI->getArg(0))->getAsUnquotedString();
+  std::string ReplaceWith =
+      cast<StringInit>(DI->getArg(1))->getAsUnquotedString();
 
-  // Emit vector typedefs.
-  bool isA64 = false;
-  bool preinsert;
-  bool postinsert;
-  for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
-    bool dummy, quad = false, poly = false;
-    char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
-    preinsert = false;
-    postinsert = false;
-
-    if (type == 'd' || (type == 'l' && poly)) {
-      preinsert = isA64? false: true;
-      isA64 = true;
-    } else {
-      postinsert = isA64? true: false;
-      isA64 = false;
-    }
-    if (postinsert)
-      OS << "#endif\n";
-    if (preinsert)
-      OS << "#ifdef __aarch64__\n";
+  size_t Idx = S.find(ToReplace);
 
-    if (poly)
-      OS << "typedef __attribute__((neon_polyvector_type(";
-    else
-      OS << "typedef __attribute__((neon_vector_type(";
+  assert_with_loc(Idx != std::string::npos,
+                  "name should contain '" + ToReplace + "'!");
+  S.replace(Idx, ToReplace.size(), ReplaceWith);
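+  // e.g. a hypothetical (name_replace "_high_", "_") would turn
+  // "vmovl_high_s8" into "vmovl_s8".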
 
-    unsigned nElts = GetNumElements(TDTypeVec[i], quad);
-    OS << utostr(nElts) << "))) ";
-    if (nElts < 10)
-      OS << " ";
+  return std::make_pair(Type::getVoid(), S);
+}
 
-    OS << TypeString('s', TDTypeVec[i]);
-    OS << " " << TypeString('d', TDTypeVec[i]) << ";\n";
+std::pair<Type, std::string> Intrinsic::emitDagLiteral(DagInit *DI) {
+  std::string Ty = cast<StringInit>(DI->getArg(0))->getAsUnquotedString();
+  std::string Value = cast<StringInit>(DI->getArg(1))->getAsUnquotedString();
+  return std::make_pair(Type::fromTypedefName(Ty), Value);
+}
 
+std::pair<Type, std::string> Intrinsic::emitDagArg(Init *Arg,
+                                                   std::string ArgName) {
+  if (ArgName.size()) {
+    assert_with_loc(!Arg->isComplete(),
+                    "Arguments must either be DAGs or names, not both!");
+    assert_with_loc(Variables.find(ArgName) != Variables.end(),
+                    "Variable not defined!");
+    Variable &V = Variables[ArgName];
+    return std::make_pair(V.getType(), V.getName());
   }
-  postinsert = isA64? true: false;
-  if (postinsert)
-    OS << "#endif\n";
-  OS << "\n";
 
-  // Emit struct typedefs.
-  isA64 = false;
-  for (unsigned vi = 2; vi != 5; ++vi) {
-    for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
-      bool dummy, quad = false, poly = false;
-      char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
-      preinsert = false;
-      postinsert = false;
-
-      if (type == 'd' || (type == 'l' && poly)) {
-        preinsert = isA64? false: true;
-        isA64 = true;
-      } else {
-        postinsert = isA64? true: false;
-        isA64 = false;
-      }
-      if (postinsert)
-        OS << "#endif\n";
-      if (preinsert)
-        OS << "#ifdef __aarch64__\n";
+  assert(Arg && "Neither ArgName nor Arg?!");
+  DagInit *DI = dyn_cast<DagInit>(Arg);
+  assert_with_loc(DI, "Arguments must either be DAGs or names!");
 
-      std::string ts = TypeString('d', TDTypeVec[i]);
-      std::string vs = TypeString('0' + vi, TDTypeVec[i]);
-      OS << "typedef struct " << vs << " {\n";
-      OS << "  " << ts << " val";
-      OS << "[" << utostr(vi) << "]";
-      OS << ";\n} ";
-      OS << vs << ";\n";
-      OS << "\n";
-    }
-  }
-  postinsert = isA64? true: false;
-  if (postinsert)
-    OS << "#endif\n";
-  OS << "\n";
+  return emitDag(DI);
+}
 
-  OS<<"#define __ai static inline __attribute__((__always_inline__, __nodebug__))\n\n";
+std::string Intrinsic::generate() {
+  CurrentRecord = R;
 
-  std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst");
+  // If we call a macro, our local variables may be corrupted due to
+  // lack of proper lexical scoping. So, add a globally unique postfix
+  // to every variable.
+  //
+  // indexBody() should have set up the Dependencies set by now.
+  for (auto *I : Dependencies)
+    if (I->UseMacro) {
+      VariablePostfix = "_" + utostr(Emitter.getUniqueNumber());
+      break;
+    }
 
-  StringMap<ClassKind> EmittedMap;
-  std::string CurrentGuard = "";
-  bool InGuard = false;
+  initVariables();
 
-  // Some intrinsics are used to express others. These need to be emitted near
-  // the beginning so that the declarations are present when needed. This is
-  // rather an ugly, arbitrary list, but probably simpler than actually tracking
-  // dependency info.
-  static const char *EarlyDefsArr[] =
-      { "VFMA",      "VQMOVN",    "VQMOVUN",  "VABD",    "VMOVL",
-        "VABDL",     "VGET_HIGH", "VCOMBINE", "VSHLL_N", "VMOVL_HIGH",
-        "VMULL",     "VMLAL_N",   "VMLSL_N",  "VMULL_N", "VMULL_P64",
-        "VQDMLAL_N", "VQDMLSL_N", "VQDMULL_N" };
-  ArrayRef<const char *> EarlyDefs(EarlyDefsArr);
+  emitPrototype();
 
-  for (unsigned i = 0; i < EarlyDefs.size(); ++i) {
-    Record *R = Records.getDef(EarlyDefs[i]);
-    emitGuardedIntrinsic(OS, R, CurrentGuard, InGuard, EmittedMap);
+  if (IsUnavailable) {
+    OS << " __attribute__((unavailable));";
+  } else {
+    emitOpeningBrace();
+    emitShadowedArgs();
+    emitBody();
+    emitReturn();
+    emitClosingBrace();
   }
+  OS << "\n";
 
-  for (unsigned i = 0, e = RV.size(); i != e; ++i) {
-    Record *R = RV[i];
-    if (std::find(EarlyDefs.begin(), EarlyDefs.end(), R->getName()) !=
-        EarlyDefs.end())
-      continue;
+  CurrentRecord = nullptr;
+  return OS.str();
+}
 
-    emitGuardedIntrinsic(OS, R, CurrentGuard, InGuard, EmittedMap);
-  }
+void Intrinsic::indexBody() {
+  CurrentRecord = R;
 
-  if (InGuard)
-    OS << "#endif\n\n";
+  initVariables();
+  emitBody();
+  OS.str("");
 
-  OS << "#undef __ai\n\n";
-  OS << "#endif /* __ARM_NEON_H */\n";
+  CurrentRecord = nullptr;
 }
 
-void NeonEmitter::emitGuardedIntrinsic(raw_ostream &OS, Record *R,
-                                       std::string &CurrentGuard, bool &InGuard,
-                                       StringMap<ClassKind> &EmittedMap) {
+//===----------------------------------------------------------------------===//
+// NeonEmitter implementation
+//===----------------------------------------------------------------------===//
+
+Intrinsic *NeonEmitter::getIntrinsic(StringRef Name, ArrayRef<Type> Types) {
+  // First, look up the name in the intrinsic map.
+  assert_with_loc(IntrinsicMap.find(Name.str()) != IntrinsicMap.end(),
+                  ("Intrinsic '" + Name + "' not found!").str());
+  std::vector<Intrinsic *> &V = IntrinsicMap[Name.str()];
+  std::vector<Intrinsic *> GoodVec;
+
+  // Create a string to print if we end up failing.
+  std::string ErrMsg = "looking up intrinsic '" + Name.str() + "(";
+  for (unsigned I = 0; I < Types.size(); ++I) {
+    if (I != 0)
+      ErrMsg += ", ";
+    ErrMsg += Types[I].str();
+  }
+  ErrMsg += ")'\n";
+  ErrMsg += "Available overloads:\n";
+
+  // Now, look through each intrinsic implementation and see if the types are
+  // compatible.
+  for (auto *I : V) {
+    ErrMsg += "  - " + I->getReturnType().str() + " " + I->getMangledName();
+    ErrMsg += "(";
+    for (unsigned A = 0; A < I->getNumParams(); ++A) {
+      if (A != 0)
+        ErrMsg += ", ";
+      ErrMsg += I->getParamType(A).str();
+    }
+    ErrMsg += ")\n";
 
-  std::string NewGuard = R->getValueAsString("ArchGuard");
-  if (NewGuard != CurrentGuard) {
-    if (InGuard)
-      OS << "#endif\n\n";
-    if (NewGuard.size())
-      OS << "#if " << NewGuard << '\n';
+    if (I->getNumParams() != Types.size())
+      continue;
 
-    CurrentGuard = NewGuard;
-    InGuard = NewGuard.size() != 0;
+    bool Good = true;
+    for (unsigned Arg = 0; Arg < Types.size(); ++Arg) {
+      if (I->getParamType(Arg) != Types[Arg]) {
+        Good = false;
+        break;
+      }
+    }
+    if (Good)
+      GoodVec.push_back(I);
   }
 
-  emitIntrinsic(OS, R, EmittedMap);
+  assert_with_loc(GoodVec.size() > 0,
+                  "No compatible intrinsic found - " + ErrMsg);
+  assert_with_loc(GoodVec.size() == 1, "Multiple overloads found - " + ErrMsg);
+
+  return GoodVec.front();
 }
 
-/// emitIntrinsic - Write out the arm_neon.h header file definitions for the
-/// intrinsics specified by record R checking for intrinsic uniqueness.
-void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R,
-                                StringMap<ClassKind> &EmittedMap) {
-  std::string name = R->getValueAsString("Name");
+void NeonEmitter::createIntrinsic(Record *R,
+                                  SmallVectorImpl<Intrinsic *> &Out) {
+  std::string Name = R->getValueAsString("Name");
   std::string Proto = R->getValueAsString("Prototype");
   std::string Types = R->getValueAsString("Types");
+  Record *OperationRec = R->getValueAsDef("Operation");
+  bool CartesianProductOfTypes = R->getValueAsBit("CartesianProductOfTypes");
+  std::string Guard = R->getValueAsString("ArchGuard");
+  bool IsUnavailable = OperationRec->getValueAsBit("Unavailable");
+
+  // Set the global current record. This allows assert_with_loc to produce
+  // decent location information even when highly nested.
+  CurrentRecord = R;
 
-  SmallVector<StringRef, 16> TypeVec;
-  ParseTypes(R, Types, TypeVec);
+  ListInit *Body = OperationRec->getValueAsListInit("Ops");
 
-  OpKind kind = OpMap[R->getValueAsDef("Operand")->getName()];
+  std::vector<TypeSpec> TypeSpecs = TypeSpec::fromTypeSpecs(Types);
 
-  ClassKind classKind = ClassNone;
+  ClassKind CK = ClassNone;
   if (R->getSuperClasses().size() >= 2)
-    classKind = ClassMap[R->getSuperClasses()[1]];
-  if (classKind == ClassNone && kind == OpNone)
-    PrintFatalError(R->getLoc(), "Builtin has no class kind");
-
-  for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
-    if (kind == OpReinterpret) {
-      bool outQuad = false;
-      bool dummy = false;
-      (void)ClassifyType(TypeVec[ti], outQuad, dummy, dummy);
-      for (unsigned srcti = 0, srcte = TypeVec.size();
-           srcti != srcte; ++srcti) {
-        bool inQuad = false;
-        (void)ClassifyType(TypeVec[srcti], inQuad, dummy, dummy);
-        if (srcti == ti || inQuad != outQuad)
+    CK = ClassMap[R->getSuperClasses()[1]];
+
+  std::vector<std::pair<TypeSpec, TypeSpec>> NewTypeSpecs;
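+  // Build the (destination, source) type pairs to instantiate. The source type
+  // normally equals the destination type; CartesianProductOfTypes definitions
+  // instead pair each type with every other type of the same overall width.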
+  for (auto TS : TypeSpecs) {
+    if (CartesianProductOfTypes) {
+      Type DefaultT(TS, 'd');
+      for (auto SrcTS : TypeSpecs) {
+        Type DefaultSrcT(SrcTS, 'd');
+        if (TS == SrcTS ||
+            DefaultSrcT.getSizeInBits() != DefaultT.getSizeInBits())
           continue;
-        std::string s = GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[srcti],
-                                     OpCast, ClassS);
-        if (EmittedMap.count(s))
-          continue;
-        EmittedMap[s] = ClassS;
-        OS << s;
+        NewTypeSpecs.push_back(std::make_pair(TS, SrcTS));
       }
     } else {
-      std::string s =
-          GenIntrinsic(name, Proto, TypeVec[ti], TypeVec[ti], kind, classKind);
-      if (EmittedMap.count(s)) {
-        errs() << "warning: duplicate definition: " << name
-               << " (type: " << TypeString('d', TypeVec[ti]) << ")\n";
-        continue;
-      }
-      EmittedMap[s] = classKind;
-      OS << s;
+      NewTypeSpecs.push_back(std::make_pair(TS, TS));
     }
   }
-  OS << "\n";
-}
-
-static unsigned RangeFromType(const char mod, StringRef typestr) {
-  // base type to get the type string for.
-  bool quad = false, dummy = false;
-  char type = ClassifyType(typestr, quad, dummy, dummy);
-  type = ModType(mod, type, quad, dummy, dummy, dummy, dummy, dummy);
 
-  switch (type) {
-    case 'c':
-      return (8 << (int)quad) - 1;
-    case 'h':
-    case 's':
-      return (4 << (int)quad) - 1;
-    case 'f':
-    case 'i':
-      return (2 << (int)quad) - 1;
-    case 'd':
-    case 'l':
-      return (1 << (int)quad) - 1;
-    case 'k':
-      return 0;
-    default:
-      PrintFatalError("unhandled type!");
-  }
-}
+  std::sort(NewTypeSpecs.begin(), NewTypeSpecs.end());
+  NewTypeSpecs.erase(std::unique(NewTypeSpecs.begin(), NewTypeSpecs.end()),
+                     NewTypeSpecs.end());
 
-static unsigned RangeScalarShiftImm(const char mod, StringRef typestr) {
-  // base type to get the type string for.
-  bool dummy = false;
-  char type = ClassifyType(typestr, dummy, dummy, dummy);
-  type = ModType(mod, type, dummy, dummy, dummy, dummy, dummy, dummy);
+  for (auto &I : NewTypeSpecs) {
+    Intrinsic *IT = new Intrinsic(R, Name, Proto, I.first, I.second, CK, Body,
+                                  *this, Guard, IsUnavailable);
 
-  switch (type) {
-    case 'c':
-      return 7;
-    case 'h':
-    case 's':
-      return 15;
-    case 'f':
-    case 'i':
-      return 31;
-    case 'd':
-    case 'l':
-      return 63;
-    case 'k':
-      return 127;
-    default:
-      PrintFatalError("unhandled type!");
+    IntrinsicMap[Name].push_back(IT);
+    Out.push_back(IT);
   }
-}
 
-/// Generate the ARM and AArch64 intrinsic range checking code for
-/// shift/lane immediates, checking for unique declarations.
-void
-NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS) {
-  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
-  StringMap<OpKind> EmittedMap;
+  CurrentRecord = nullptr;
+}
 
-  // Generate the intrinsic range checking code for shift/lane immediates.
-  OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";
+/// genBuiltinsDef: Generate the BuiltinsARM.def and BuiltinsAArch64.def
+/// declaration of builtins, checking for unique builtin declarations.
+void NeonEmitter::genBuiltinsDef(raw_ostream &OS,
+                                 SmallVectorImpl<Intrinsic *> &Defs) {
+  OS << "#ifdef GET_NEON_BUILTINS\n";
 
-  for (unsigned i = 0, e = RV.size(); i != e; ++i) {
-    Record *R = RV[i];
+  // We only want to emit each builtin once, and we want to emit them in
+  // alphabetical order, so use a std::set.
+  std::set<std::string> Builtins;
 
-    OpKind k = OpMap[R->getValueAsDef("Operand")->getName()];
-    if (k != OpNone)
+  for (auto *Def : Defs) {
+    if (Def->hasBody())
       continue;
-
-    std::string name = R->getValueAsString("Name");
-    std::string Proto = R->getValueAsString("Prototype");
-    std::string Types = R->getValueAsString("Types");
-    std::string Rename = name + "@" + Proto;
-
     // Functions with 'a' (the splat code) in the type prototype should not get
     // their own builtin as they use the non-splat variant.
-    if (Proto.find('a') != std::string::npos)
+    if (Def->hasSplat())
       continue;
 
-    // Functions which do not have an immediate do not need to have range
-    // checking code emitted.
-    size_t immPos = Proto.find('i');
-    if (immPos == std::string::npos)
-      continue;
+    std::string S = "BUILTIN(__builtin_neon_" + Def->getMangledName() + ", \"";
 
-    SmallVector<StringRef, 16> TypeVec;
-    ParseTypes(R, Types, TypeVec);
-
-    if (R->getSuperClasses().size() < 2)
-      PrintFatalError(R->getLoc(), "Builtin has no class kind");
-
-    ClassKind ck = ClassMap[R->getSuperClasses()[1]];
-    if (!ProtoHasScalar(Proto))
-      ck = ClassB;
-
-    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
-      std::string namestr, shiftstr, rangestr;
-
-      if (R->getValueAsBit("isVCVT_N")) {
-        // VCVT between floating- and fixed-point values takes an immediate
-        // in the range [1, 32] for f32, or [1, 64] for f64.
-        ck = ClassB;
-        if (name.find("32") != std::string::npos)
-          rangestr = "l = 1; u = 31"; // upper bound = l + u
-        else if (name.find("64") != std::string::npos)
-          rangestr = "l = 1; u = 63";
-        else
-          PrintFatalError(R->getLoc(),
-              "Fixed point convert name should contains \"32\" or \"64\"");
-
-      } else if (R->getValueAsBit("isScalarShift")) {
-        // Right shifts have an 'r' in the name, left shifts do not.  Convert
-        // instructions have the same bounds and right shifts.
-        if (name.find('r') != std::string::npos ||
-            name.find("cvt") != std::string::npos)
-          rangestr = "l = 1; ";
-
-        unsigned upBound = RangeScalarShiftImm(Proto[immPos - 1], TypeVec[ti]);
-        // Narrow shift has half the upper bound
-        if (R->getValueAsBit("isScalarNarrowShift"))
-          upBound /= 2;
-
-        rangestr += "u = " + utostr(upBound);
-      } else if (R->getValueAsBit("isShift")) {
-        // Builtins which are overloaded by type will need to have their upper
-        // bound computed at Sema time based on the type constant.
-        shiftstr = ", true";
-
-        // Right shifts have an 'r' in the name, left shifts do not.
-        if (name.find('r') != std::string::npos)
-          rangestr = "l = 1; ";
-
-        rangestr += "u = RFT(TV" + shiftstr + ")";
-      } else if (ck == ClassB) {
-        // ClassB intrinsics have a type (and hence lane number) that is only
-        // known at runtime.
-        assert(immPos > 0 && "unexpected immediate operand");
-        if (R->getValueAsBit("isLaneQ"))
-          rangestr = "u = RFT(TV, false, true)";
-        else
-          rangestr = "u = RFT(TV, false, false)";
-      } else {
-        // The immediate generally refers to a lane in the preceding argument.
-        assert(immPos > 0 && "unexpected immediate operand");
-        rangestr =
-            "u = " + utostr(RangeFromType(Proto[immPos - 1], TypeVec[ti]));
-      }
-      // Make sure cases appear only once by uniquing them in a string map.
-      namestr = MangleName(name, TypeVec[ti], ck);
-      if (EmittedMap.count(namestr))
-        continue;
-      EmittedMap[namestr] = OpNone;
-
-      // Calculate the index of the immediate that should be range checked.
-      unsigned immidx = 0;
-
-      // Builtins that return a struct of multiple vectors have an extra
-      // leading arg for the struct return.
-      if (IsMultiVecProto(Proto[0]))
-        ++immidx;
-
-      // Add one to the index for each argument until we reach the immediate
-      // to be checked.  Structs of vectors are passed as multiple arguments.
-      for (unsigned ii = 1, ie = Proto.size(); ii != ie; ++ii) {
-        switch (Proto[ii]) {
-        default:
-          immidx += 1;
-          break;
-        case '2':
-        case 'B':
-          immidx += 2;
-          break;
-        case '3':
-        case 'C':
-          immidx += 3;
-          break;
-        case '4':
-        case 'D':
-          immidx += 4;
-          break;
-        case 'i':
-          ie = ii + 1;
-          break;
-        }
-      }
-      OS << "case NEON::BI__builtin_neon_";
-      OS << MangleName(name, TypeVec[ti], ck) << ": i = " << immidx << "; "
-         << rangestr << "; break;\n";
-    }
+    S += Def->getBuiltinTypeStr();
+    S += "\", \"n\")";
+
+    Builtins.insert(S);
   }
+
+  for (auto &S : Builtins)
+    OS << S << "\n";
   OS << "#endif\n\n";
 }
 
-struct OverloadInfo {
-  uint64_t Mask;
-  int PtrArgNum;
-  bool HasConstPtr;
-};
 /// Generate the ARM and AArch64 overloaded type checking code for
 /// SemaChecking.cpp, checking for unique builtin declarations.
-void
-NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS) {
-  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
-
-  // Generate the overloaded type checking code for SemaChecking.cpp
+void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS,
+                                           SmallVectorImpl<Intrinsic *> &Defs) {
   OS << "#ifdef GET_NEON_OVERLOAD_CHECK\n";
 
   // We record each overload check line before emitting because subsequent Inst
   // definitions may extend the number of permitted types (i.e. augment the
   // Mask). Use std::map to avoid sorting the table by hash number.
+  struct OverloadInfo {
+    uint64_t Mask;
+    int PtrArgNum;
+    bool HasConstPtr;
+    OverloadInfo() : Mask(0ULL), PtrArgNum(0), HasConstPtr(false) {}
+  };
   std::map<std::string, OverloadInfo> OverloadMap;
-  typedef std::map<std::string, OverloadInfo>::iterator OverloadIterator;
 
-  for (unsigned i = 0, e = RV.size(); i != e; ++i) {
-    Record *R = RV[i];
-    OpKind k = OpMap[R->getValueAsDef("Operand")->getName()];
-    if (k != OpNone)
+  for (auto *Def : Defs) {
+    // If the def has a body (that is, it has Operation DAGs), it won't call
+    // __builtin_neon_* so we don't need to generate a definition for it.
+    if (Def->hasBody())
       continue;
-
-    std::string Proto = R->getValueAsString("Prototype");
-    std::string Types = R->getValueAsString("Types");
-    std::string name = R->getValueAsString("Name");
-    std::string Rename = name + "@" + Proto;
-
     // Functions with 'a' (the splat code) in the type prototype should not get
     // their own builtin as they use the non-splat variant.
-    if (Proto.find('a') != std::string::npos)
+    if (Def->hasSplat())
       continue;
-
     // Functions which have a scalar argument cannot be overloaded, no need to
     // check them if we are emitting the type checking code.
-    if (ProtoHasScalar(Proto))
+    if (Def->protoHasScalar())
       continue;
 
-    SmallVector<StringRef, 16> TypeVec;
-    ParseTypes(R, Types, TypeVec);
-
-    if (R->getSuperClasses().size() < 2)
-      PrintFatalError(R->getLoc(), "Builtin has no class kind");
+    uint64_t Mask = 0ULL;
+    Type Ty = Def->getReturnType();
+    if (Def->getProto()[0] == 'v' || Def->getProto()[0] == 'f' ||
+        Def->getProto()[0] == 'F')
+      Ty = Def->getParamType(0);
+    if (Ty.isPointer())
+      Ty = Def->getParamType(1);
 
-    int si = -1, qi = -1;
-    uint64_t mask = 0, qmask = 0;
-    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
-      // Generate the switch case(s) for this builtin for the type validation.
-      bool quad = false, poly = false, usgn = false;
-      (void) ClassifyType(TypeVec[ti], quad, poly, usgn);
-
-      if (quad) {
-        qi = ti;
-        qmask |= 1ULL << GetNeonEnum(Proto, TypeVec[ti]);
-      } else {
-        si = ti;
-        mask |= 1ULL << GetNeonEnum(Proto, TypeVec[ti]);
-      }
-    }
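+    // Record the NEON type code of the type chosen above; definitions that
+    // share a mangled name have their masks merged into one entry below.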
+    Mask |= 1ULL << Ty.getNeonEnum();
 
-    // Check if the builtin function has a pointer or const pointer argument.
+    // Check if the function has a pointer or const pointer argument.
+    std::string Proto = Def->getProto();
     int PtrArgNum = -1;
     bool HasConstPtr = false;
-    for (unsigned arg = 1, arge = Proto.size(); arg != arge; ++arg) {
-      char ArgType = Proto[arg];
+    for (unsigned I = 0; I < Def->getNumParams(); ++I) {
+      char ArgType = Proto[I + 1];
       if (ArgType == 'c') {
         HasConstPtr = true;
-        PtrArgNum = arg - 1;
+        PtrArgNum = I;
         break;
       }
       if (ArgType == 'p') {
-        PtrArgNum = arg - 1;
+        PtrArgNum = I;
         break;
       }
     }
     // For sret builtins, adjust the pointer argument index.
-    if (PtrArgNum >= 0 && IsMultiVecProto(Proto[0]))
+    if (PtrArgNum >= 0 && Def->getReturnType().getNumVectors() > 1)
       PtrArgNum += 1;
 
+    std::string Name = Def->getName();
     // Omit type checking for the pointer arguments of vld1_lane, vld1_dup,
     // and vst1_lane intrinsics.  Using a pointer to the vector element
     // type with one of those operations causes codegen to select an aligned
     // load/store instruction.  If you want an unaligned operation,
     // the pointer argument needs to have less alignment than element type,
     // so just accept any pointer type.
-    if (name == "vld1_lane" || name == "vld1_dup" || name == "vst1_lane") {
+    if (Name == "vld1_lane" || Name == "vld1_dup" || Name == "vst1_lane") {
       PtrArgNum = -1;
       HasConstPtr = false;
     }
 
-    if (mask) {
-      std::pair<OverloadIterator, bool> I = OverloadMap.insert(std::make_pair(
-          MangleName(name, TypeVec[si], ClassB), OverloadInfo()));
-      OverloadInfo &Record = I.first->second;
-      if (!I.second)
-        assert(Record.PtrArgNum == PtrArgNum &&
-               Record.HasConstPtr == HasConstPtr);
-      Record.Mask |= mask;
-      Record.PtrArgNum = PtrArgNum;
-      Record.HasConstPtr = HasConstPtr;
-    }
-    if (qmask) {
-      std::pair<OverloadIterator, bool> I = OverloadMap.insert(std::make_pair(
-          MangleName(name, TypeVec[qi], ClassB), OverloadInfo()));
-      OverloadInfo &Record = I.first->second;
-      if (!I.second)
-        assert(Record.PtrArgNum == PtrArgNum &&
-               Record.HasConstPtr == HasConstPtr);
-      Record.Mask |= qmask;
-      Record.PtrArgNum = PtrArgNum;
-      Record.HasConstPtr = HasConstPtr;
+    if (Mask) {
+      std::string Name = Def->getMangledName();
+      OverloadMap.insert(std::make_pair(Name, OverloadInfo()));
+      OverloadInfo &OI = OverloadMap[Name];
+      OI.Mask |= Mask;
+      OI.PtrArgNum |= PtrArgNum;
+      OI.HasConstPtr = HasConstPtr;
     }
   }
 
-  for (OverloadIterator I = OverloadMap.begin(), E = OverloadMap.end(); I != E;
-       ++I) {
-    OverloadInfo &BuiltinOverloads = I->second;
-    OS << "case NEON::BI__builtin_neon_" << I->first << ": ";
-    OS << "mask = " << "0x" << utohexstr(BuiltinOverloads.Mask) << "ULL";
-    if (BuiltinOverloads.PtrArgNum >= 0)
-      OS << "; PtrArgNum = " << BuiltinOverloads.PtrArgNum;
-    if (BuiltinOverloads.HasConstPtr)
+  for (auto &I : OverloadMap) {
+    OverloadInfo &OI = I.second;
+
+    OS << "case NEON::BI__builtin_neon_" << I.first << ": ";
+    OS << "mask = 0x" << utohexstr(OI.Mask) << "ULL";
+    if (OI.PtrArgNum >= 0)
+      OS << "; PtrArgNum = " << OI.PtrArgNum;
+    if (OI.HasConstPtr)
       OS << "; HasConstPtr = true";
     OS << "; break;\n";
   }
-
   OS << "#endif\n\n";
 }
 
-/// genBuiltinsDef: Generate the BuiltinsARM.def and  BuiltinsAArch64.def
-/// declaration of builtins, checking for unique builtin declarations.
-void NeonEmitter::genBuiltinsDef(raw_ostream &OS) {
-  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+void
+NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS,
+                                        SmallVectorImpl<Intrinsic *> &Defs) {
+  OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";
 
-  // We want to emit the intrinsics in alphabetical order, so use the more
-  // expensive std::map to gather them together first.
-  std::map<std::string, OpKind> EmittedMap;
+  std::set<std::string> Emitted;
 
-  for (unsigned i = 0, e = RV.size(); i != e; ++i) {
-    Record *R = RV[i];
-    OpKind k = OpMap[R->getValueAsDef("Operand")->getName()];
-    if (k != OpNone)
+  for (auto *Def : Defs) {
+    if (Def->hasBody())
       continue;
-
-    std::string Proto = R->getValueAsString("Prototype");
-    std::string name = R->getValueAsString("Name");
-    std::string Rename = name + "@" + Proto;
-
     // Functions with 'a' (the splat code) in the type prototype should not get
     // their own builtin as they use the non-splat variant.
-    if (Proto.find('a') != std::string::npos)
+    if (Def->hasSplat())
+      continue;
+    // Functions which do not have an immediate do not need to have range
+    // checking code emitted.
+    if (!Def->hasImmediate())
+      continue;
+    if (Emitted.find(Def->getMangledName()) != Emitted.end())
       continue;
 
-    std::string Types = R->getValueAsString("Types");
-    SmallVector<StringRef, 16> TypeVec;
-    ParseTypes(R, Types, TypeVec);
+    std::string LowerBound, UpperBound;
 
-    if (R->getSuperClasses().size() < 2)
-      PrintFatalError(R->getLoc(), "Builtin has no class kind");
+    Record *R = Def->getRecord();
+    if (R->getValueAsBit("isVCVT_N")) {
+      // VCVT between floating- and fixed-point values takes an immediate
+      // in the range [1, 32] for f32 or [1, 64] for f64.
+      LowerBound = "1";
+      if (Def->getBaseType().getElementSizeInBits() == 32)
+        UpperBound = "31";
+      else
+        UpperBound = "63";
+    } else if (R->getValueAsBit("isScalarShift")) {
+      // Right shifts have an 'r' in the name, left shifts do not. Convert
+      // instructions have the same bounds as right shifts.
+      if (Def->getName().find('r') != std::string::npos ||
+          Def->getName().find("cvt") != std::string::npos)
+        LowerBound = "1";
+
+      UpperBound = utostr(Def->getReturnType().getElementSizeInBits() - 1);
+    } else if (R->getValueAsBit("isShift")) {
+      // Builtins which are overloaded by type will need to have their upper
+      // bound computed at Sema time based on the type constant.
+
+      // Right shifts have an 'r' in the name, left shifts do not.
+      if (Def->getName().find('r') != std::string::npos)
+        LowerBound = "1";
+      UpperBound = "RFT(TV, true)";
+    } else if (Def->getClassKind(true) == ClassB) {
+      // ClassB intrinsics have a type (and hence lane number) that is only
+      // known at runtime.
+      if (R->getValueAsBit("isLaneQ"))
+        UpperBound = "RFT(TV, false, true)";
+      else
+        UpperBound = "RFT(TV, false, false)";
+    } else {
+      // The immediate generally refers to a lane in the preceding argument.
+      assert(Def->getImmediateIdx() > 0);
+      Type T = Def->getParamType(Def->getImmediateIdx() - 1);
+      UpperBound = utostr(T.getNumElements() - 1);
+    }
 
-    ClassKind ck = ClassMap[R->getSuperClasses()[1]];
+    // Calculate the index of the immediate that should be range checked.
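+    // Builtins returning a struct of multiple vectors take an extra leading
+    // sret argument, and struct-of-vector parameters are passed as several
+    // arguments, so map the prototype position onto the generated builtin's
+    // argument list.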
+    unsigned Idx = Def->getNumParams();
+    if (Def->hasImmediate())
+      Idx = Def->getGeneratedParamIdx(Def->getImmediateIdx());
 
-    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
-      // Generate the declaration for this builtin, ensuring
-      // that each unique BUILTIN() macro appears only once in the output
-      // stream.
-      std::string bd = GenBuiltinDef(name, Proto, TypeVec[ti], ck);
-      if (EmittedMap.count(bd))
-        continue;
+    OS << "case NEON::BI__builtin_neon_" << Def->getMangledName() << ": "
+       << "i = " << Idx << ";";
+    if (LowerBound.size())
+      OS << " l = " << LowerBound << ";";
+    if (UpperBound.size())
+      OS << " u = " << UpperBound << ";";
+    OS << " break;\n";
 
-      EmittedMap[bd] = OpNone;
-    }
+    Emitted.insert(Def->getMangledName());
   }
 
-  // Generate BuiltinsNEON.
-  OS << "#ifdef GET_NEON_BUILTINS\n";
-
-  for (std::map<std::string, OpKind>::iterator I = EmittedMap.begin(),
-                                               E = EmittedMap.end();
-       I != E; ++I)
-    OS << I->first << "\n";
-
   OS << "#endif\n\n";
 }
 
@@ -3230,187 +2058,220 @@ void NeonEmitter::genBuiltinsDef(raw_ostream &OS) {
 void NeonEmitter::runHeader(raw_ostream &OS) {
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
 
+  SmallVector<Intrinsic *, 128> Defs;
+  for (auto *R : RV)
+    createIntrinsic(R, Defs);
+
   // Generate shared BuiltinsXXX.def
-  genBuiltinsDef(OS);
+  genBuiltinsDef(OS, Defs);
 
   // Generate ARM overloaded type checking code for SemaChecking.cpp
-  genOverloadTypeCheckCode(OS);
+  genOverloadTypeCheckCode(OS, Defs);
 
   // Generate ARM range checking code for shift/lane immediates.
-  genIntrinsicRangeCheckCode(OS);
+  genIntrinsicRangeCheckCode(OS, Defs);
 }
 
-/// GenTest - Write out a test for the intrinsic specified by the name and
-/// type strings, including the embedded patterns for FileCheck to match.
-static std::string GenTest(const std::string &name,
-                           const std::string &proto,
-                           StringRef outTypeStr, StringRef inTypeStr,
-                           bool isShift, bool isHiddenLOp,
-                           ClassKind ck, const std::string &InstName,
-                           bool isA64,
-                           std::string & testFuncProto) {
-  assert(!proto.empty() && "");
-  std::string s;
-
-  // Function name with type suffix
-  std::string mangledName = MangleName(name, outTypeStr, ClassS);
-  if (outTypeStr != inTypeStr) {
-    // If the input type is different (e.g., for vreinterpret), append a suffix
-    // for the input type.  String off a "Q" (quad) prefix so that MangleName
-    // does not insert another "q" in the name.
-    unsigned typeStrOff = (inTypeStr[0] == 'Q' ? 1 : 0);
-    StringRef inTypeNoQuad = inTypeStr.substr(typeStrOff);
-    mangledName = MangleName(mangledName, inTypeNoQuad, ClassS);
-  }
-
-  // todo: GenerateChecksForIntrinsic does not generate CHECK
-  // for aarch64 instructions yet
-  std::vector<std::string> FileCheckPatterns;
-  if (!isA64) {
-    GenerateChecksForIntrinsic(name, proto, outTypeStr, inTypeStr, ck, InstName,
-                               isHiddenLOp, FileCheckPatterns);
-    s+= "// CHECK_ARM: test_" + mangledName + "\n";
-  }
-  s += "// CHECK_AARCH64: test_" + mangledName + "\n";
-
-  // Emit the FileCheck patterns.
-  // If for any reason we do not want to emit a check, mangledInst
-  // will be the empty string.
-  if (FileCheckPatterns.size()) {
-    for (std::vector<std::string>::const_iterator i = FileCheckPatterns.begin(),
-                                                  e = FileCheckPatterns.end();
-         i != e;
-         ++i) {
-      s += "// CHECK_ARM: " + *i + "\n";
+/// run - Read the records in arm_neon.td and output arm_neon.h.  arm_neon.h
+/// is comprised of type definitions and function declarations.
+void NeonEmitter::run(raw_ostream &OS) {
+  OS << "/*===---- arm_neon.h - ARM Neon intrinsics "
+        "------------------------------"
+        "---===\n"
+        " *\n"
+        " * Permission is hereby granted, free of charge, to any person "
+        "obtaining "
+        "a copy\n"
+        " * of this software and associated documentation files (the "
+        "\"Software\"),"
+        " to deal\n"
+        " * in the Software without restriction, including without limitation "
+        "the "
+        "rights\n"
+        " * to use, copy, modify, merge, publish, distribute, sublicense, "
+        "and/or sell\n"
+        " * copies of the Software, and to permit persons to whom the Software "
+        "is\n"
+        " * furnished to do so, subject to the following conditions:\n"
+        " *\n"
+        " * The above copyright notice and this permission notice shall be "
+        "included in\n"
+        " * all copies or substantial portions of the Software.\n"
+        " *\n"
+        " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, "
+        "EXPRESS OR\n"
+        " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF "
+        "MERCHANTABILITY,\n"
+        " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT "
+        "SHALL THE\n"
+        " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR "
+        "OTHER\n"
+        " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, "
+        "ARISING FROM,\n"
+        " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER "
+        "DEALINGS IN\n"
+        " * THE SOFTWARE.\n"
+        " *\n"
+        " *===-----------------------------------------------------------------"
+        "---"
+        "---===\n"
+        " */\n\n";
+
+  OS << "#ifndef __ARM_NEON_H\n";
+  OS << "#define __ARM_NEON_H\n\n";
+
+  OS << "#if !defined(__ARM_NEON)\n";
+  OS << "#error \"NEON support not enabled\"\n";
+  OS << "#endif\n\n";
+
+  OS << "#include <stdint.h>\n\n";
+
+  // Emit NEON-specific scalar typedefs.
+  OS << "typedef float float32_t;\n";
+  OS << "typedef __fp16 float16_t;\n";
+
+  OS << "#ifdef __aarch64__\n";
+  OS << "typedef double float64_t;\n";
+  OS << "#endif\n\n";
+
+  // For now, signedness of polynomial types depends on target
+  OS << "#ifdef __aarch64__\n";
+  OS << "typedef uint8_t poly8_t;\n";
+  OS << "typedef uint16_t poly16_t;\n";
+  OS << "typedef uint64_t poly64_t;\n";
+  OS << "typedef __uint128_t poly128_t;\n";
+  OS << "#else\n";
+  OS << "typedef int8_t poly8_t;\n";
+  OS << "typedef int16_t poly16_t;\n";
+  OS << "#endif\n";
+
+  // Emit Neon vector typedefs.
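+  // Each letter in the type specifier string below is an element type
+  // (c/s/i/l for 8/16/32/64-bit integers, h/f/d for half/float/double),
+  // optionally prefixed by 'Q' (128-bit vector), 'U' (unsigned) or
+  // 'P' (polynomial).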
+  std::string TypedefTypes(
+      "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPsPlQPl");
+  std::vector<TypeSpec> TDTypeVec = TypeSpec::fromTypeSpecs(TypedefTypes);
+
+  // Emit vector typedefs.
+  bool InIfdef = false;
+  for (auto &TS : TDTypeVec) {
+    bool IsA64 = false;
+    Type T(TS, 'd');
+    if (T.isDouble() || (T.isPoly() && T.isLong()))
+      IsA64 = true;
+
+    if (InIfdef && !IsA64) {
+      OS << "#endif\n";
+      InIfdef = false;
+    }
+    if (!InIfdef && IsA64) {
+      OS << "#ifdef __aarch64__\n";
+      InIfdef = true;
     }
+
+    if (T.isPoly())
+      OS << "typedef __attribute__((neon_polyvector_type(";
+    else
+      OS << "typedef __attribute__((neon_vector_type(";
+
+    Type T2 = T;
+    T2.makeScalar();
+    OS << utostr(T.getNumElements()) << "))) ";
+    OS << T2.str();
+    OS << " " << T.str() << ";\n";
   }
+  if (InIfdef)
+    OS << "#endif\n";
+  OS << "\n";
 
-  // Emit the start of the test function.
+  // Emit struct typedefs.
+  InIfdef = false;
+  for (unsigned NumMembers = 2; NumMembers <= 4; ++NumMembers) {
+    for (auto &TS : TDTypeVec) {
+      bool IsA64 = false;
+      Type T(TS, 'd');
+      if (T.isDouble() || (T.isPoly() && T.isLong()))
+        IsA64 = true;
+
+      if (InIfdef && !IsA64) {
+        OS << "#endif\n";
+        InIfdef = false;
+      }
+      if (!InIfdef && IsA64) {
+        OS << "#ifdef __aarch64__\n";
+        InIfdef = true;
+      }
 
-  testFuncProto = TypeString(proto[0], outTypeStr) + " test_" + mangledName + "(";
-  char arg = 'a';
-  std::string comma;
-  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
-    // Do not create arguments for values that must be immediate constants.
-    if (proto[i] == 'i')
-      continue;
-    testFuncProto += comma + TypeString(proto[i], inTypeStr) + " ";
-    testFuncProto.push_back(arg);
-    comma = ", ";
-  }
-  testFuncProto += ")";
-
-  s+= testFuncProto;
-  s+= " {\n  ";
-
-  if (proto[0] != 'v')
-    s += "return ";
-  s += mangledName + "(";
-  arg = 'a';
-  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
-    if (proto[i] == 'i') {
-      // For immediate operands, test the maximum value.
-      if (isShift)
-        s += "1"; // FIXME
-      else
-        // The immediate generally refers to a lane in the preceding argument.
-        s += utostr(RangeFromType(proto[i-1], inTypeStr));
-    } else {
-      s.push_back(arg);
+      char M = '2' + (NumMembers - 2);
+      Type VT(TS, M);
+      OS << "typedef struct " << VT.str() << " {\n";
+      OS << "  " << T.str() << " val";
+      OS << "[" << utostr(NumMembers) << "]";
+      OS << ";\n} ";
+      OS << VT.str() << ";\n";
+      OS << "\n";
     }
-    if ((i + 1) < e)
-      s += ", ";
   }
-  s += ");\n}\n\n";
-  return s;
-}
+  if (InIfdef)
+    OS << "#endif\n";
+  OS << "\n";
 
-/// Write out all intrinsic tests for the specified target, checking
-/// for intrinsic test uniqueness.
-void NeonEmitter::genTargetTest(raw_ostream &OS) {
-  StringMap<OpKind> EmittedMap;
-  std::string CurrentGuard = "";
-  bool InGuard = false;
+  OS << "#define __ai static inline __attribute__((__always_inline__, "
+        "__nodebug__))\n\n";
 
+  SmallVector<Intrinsic *, 128> Defs;
   std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
-  for (unsigned i = 0, e = RV.size(); i != e; ++i) {
-    Record *R = RV[i];
-    std::string name = R->getValueAsString("Name");
-    std::string Proto = R->getValueAsString("Prototype");
-    std::string Types = R->getValueAsString("Types");
-    bool isShift = R->getValueAsBit("isShift");
-    std::string InstName = R->getValueAsString("InstName");
-    bool isHiddenLOp = R->getValueAsBit("isHiddenLInst");
-
-    std::string NewGuard = R->getValueAsString("ArchGuard");
-    if (NewGuard != CurrentGuard) {
-      if (InGuard)
-        OS << "#endif\n\n";
-      if (NewGuard.size())
-        OS << "#if " << NewGuard << '\n';
-
-      CurrentGuard = NewGuard;
-      InGuard = NewGuard.size() != 0;
-    }
-
-    SmallVector<StringRef, 16> TypeVec;
-    ParseTypes(R, Types, TypeVec);
+  for (auto *R : RV)
+    createIntrinsic(R, Defs);
+
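+  // Generate every body once up front (discarding the text) so that each
+  // intrinsic's dependency set is populated before choosing an emission order.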
+  for (auto *I : Defs)
+    I->indexBody();
+
+  std::stable_sort(
+      Defs.begin(), Defs.end(),
+      [](const Intrinsic *A, const Intrinsic *B) { return *A < *B; });
+
+  // Only emit a def when its requirements have been met.
+  // FIXME: This loop could be made faster, but it's fast enough for now.
+  bool MadeProgress = true;
+  std::string InGuard = "";
+  while (!Defs.empty() && MadeProgress) {
+    MadeProgress = false;
+
+    for (SmallVector<Intrinsic *, 128>::iterator I = Defs.begin();
+         I != Defs.end(); /*No step*/) {
+      bool DependenciesSatisfied = true;
+      for (auto *II : (*I)->getDependencies()) {
+        if (std::find(Defs.begin(), Defs.end(), II) != Defs.end())
+          DependenciesSatisfied = false;
+      }
+      if (!DependenciesSatisfied) {
+        // Try the next one.
+        ++I;
+        continue;
+      }
 
-    ClassKind ck = ClassMap[R->getSuperClasses()[1]];
-    OpKind kind = OpMap[R->getValueAsDef("Operand")->getName()];
-    if (kind == OpUnavailable)
-      continue;
-    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
-      if (kind == OpReinterpret) {
-        bool outQuad = false;
-        bool dummy = false;
-        (void)ClassifyType(TypeVec[ti], outQuad, dummy, dummy);
-        for (unsigned srcti = 0, srcte = TypeVec.size();
-             srcti != srcte; ++srcti) {
-          bool inQuad = false;
-          (void)ClassifyType(TypeVec[srcti], inQuad, dummy, dummy);
-          if (srcti == ti || inQuad != outQuad)
-            continue;
-          std::string testFuncProto;
-          std::string s = GenTest(name, Proto, TypeVec[ti], TypeVec[srcti],
-                                  isShift, isHiddenLOp, ck, InstName,
-                                  CurrentGuard.size(), testFuncProto);
-          if (EmittedMap.count(testFuncProto))
-            continue;
-          EmittedMap[testFuncProto] = kind;
-          OS << s << "\n";
-        }
-      } else {
-        std::string testFuncProto;
-        std::string s =
-            GenTest(name, Proto, TypeVec[ti], TypeVec[ti], isShift, isHiddenLOp,
-                    ck, InstName, CurrentGuard.size(), testFuncProto);
-        OS << s << "\n";
+      // Emit #endif/#if pair if needed.
+      if ((*I)->getGuard() != InGuard) {
+        if (!InGuard.empty())
+          OS << "#endif\n";
+        InGuard = (*I)->getGuard();
+        if (!InGuard.empty())
+          OS << "#if " << InGuard << "\n";
       }
+
+      // Actually generate the intrinsic code.
+      OS << (*I)->generate();
+
+      MadeProgress = true;
+      I = Defs.erase(I);
     }
   }
-
-  if (InGuard)
+  assert(Defs.empty() && "Some requirements were not satisfied!");
+  if (!InGuard.empty())
     OS << "#endif\n";
-}
-/// runTests - Write out a complete set of tests for all of the Neon
-/// intrinsics.
-void NeonEmitter::runTests(raw_ostream &OS) {
-  OS << "// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi "
-        "apcs-gnu\\\n"
-        "// RUN:  -target-cpu swift -ffreestanding -Os -S -o - %s\\\n"
-        "// RUN:  | FileCheck %s -check-prefix=CHECK_ARM\n"
-        "\n"
-        "// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \\\n"
-        "// RUN -target-feature +neon  -ffreestanding -S -o - %s \\\n"
-        "// RUN:  | FileCheck %s -check-prefix=CHECK_AARCH64\n"
-        "\n"
-        "// REQUIRES: long_tests\n"
-        "\n"
-        "#include <arm_neon.h>\n"
-        "\n";
-
-  genTargetTest(OS);
+
+  OS << "\n";
+  OS << "#undef __ai\n\n";
+  OS << "#endif /* __ARM_NEON_H */\n";
 }
 
 namespace clang {
@@ -3421,6 +2282,6 @@ void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) {
   NeonEmitter(Records).runHeader(OS);
 }
 void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS) {
-  NeonEmitter(Records).runTests(OS);
+  assert(0 && "Neon test generation no longer implemented!");
 }
 } // End namespace clang
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 7e05496647d96ec3b64e92e5abbe651754bd0a80..78745f1aac60040bc3e32e5de418d5c2e364f6ac 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -61,6 +61,9 @@ void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeon(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS);
 void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS);
+void EmitNeon2(RecordKeeper &Records, raw_ostream &OS);
+void EmitNeonSema2(RecordKeeper &Records, raw_ostream &OS);
+void EmitNeonTest2(RecordKeeper &Records, raw_ostream &OS);
 
 void EmitClangAttrDocs(RecordKeeper &Records, raw_ostream &OS);