diff --git a/contrib/cl-simd/LICENSE b/contrib/cl-simd/LICENSE new file mode 100644 index 000000000..453e19b08 --- /dev/null +++ b/contrib/cl-simd/LICENSE @@ -0,0 +1,25 @@ +(This is the MIT / X Consortium license as taken from + http://www.opensource.org/licenses/mit-license.html on or about + Monday; July 13, 2009) + +Copyright (c) 2010 by Alexander Gavrilov + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/contrib/cl-simd/README b/contrib/cl-simd/README new file mode 100644 index 000000000..0837cac86 --- /dev/null +++ b/contrib/cl-simd/README @@ -0,0 +1,8 @@ +This module implements SSE intrinsic functions for ECL and SBCL. + +NOTE: CURRENTLY THIS SHOULD BE CONSIDERED EXPERIMENTAL, AND + SUBJECT TO INCOMPATIBLE CHANGES IN A FUTURE RELEASE. + +Since the implementation is closely tied to the internals of +the compiler, it should normally be obtained exclusively via +the bundled contrib mechanism of the above implementations. diff --git a/contrib/cl-simd/cl-simd.asd b/contrib/cl-simd/cl-simd.asd new file mode 100644 index 000000000..2041a9233 --- /dev/null +++ b/contrib/cl-simd/cl-simd.asd @@ -0,0 +1,44 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (C) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file defines the cl-simd ASDF system. +;;; +;;; Note that a completely independent definition +;;; is used to build the system as an ECL contrib. + +(defsystem :cl-simd + :version "1.0" + #+sb-building-contrib :pathname + #+sb-building-contrib #p"SYS:CONTRIB;CL-SIMD;" + :components + #+(and sbcl sb-sse-intrinsics) + ((:file "sse-package") + (:file "sbcl-core" :depends-on ("sse-package")) + (:file "sse-intrinsics" :depends-on ("sbcl-core")) + (:file "sbcl-functions" :depends-on ("sse-intrinsics")) + (:file "sbcl-arrays" :depends-on ("sbcl-functions")) + (:file "sse-array-defs" :depends-on ("sbcl-arrays")) + (:file "sse-utils" :depends-on ("sse-array-defs"))) + #+(and ecl sse2) + ((:file "sse-package") + (:file "ecl-sse-core" :depends-on ("sse-package")) + (:file "sse-intrinsics" :depends-on ("ecl-sse-core")) + (:file "sse-array-defs" :depends-on ("sse-intrinsics")) + (:file "ecl-sse-utils" :depends-on ("sse-intrinsics")) + (:file "sse-utils" :depends-on ("ecl-sse-utils"))) + #-(or (and sbcl sb-sse-intrinsics) + (and ecl sse2)) + ()) + +#+(or (and sbcl sb-sse-intrinsics) + (and ecl sse2)) +(defmethod perform :after ((o load-op) (c (eql (find-system :cl-simd)))) + (provide :cl-simd)) + +(defmethod perform ((o test-op) (c (eql (find-system :cl-simd)))) + #+(or (and sbcl sb-sse-intrinsics) + (and ecl sse2)) + (or (load (compile-file "test-sfmt.lisp")) + (error "test-sfmt failed"))) + diff --git a/contrib/cl-simd/cl-simd.texinfo b/contrib/cl-simd/cl-simd.texinfo new file mode 100644 index 000000000..c28fb927b --- /dev/null +++ b/contrib/cl-simd/cl-simd.texinfo @@ -0,0 +1,250 @@ +@node cl-simd +@section cl-simd +@cindex SSE2 Intrinsics +@cindex Intrinsics, SSE2 + +The @code{cl-simd} module provides access to SSE2 instructions +(which are nowadays supported by any CPU compatible with x86-64) +in the form of @emph{intrinsic functions}, similar to the way +adopted by modern C compilers. It also provides some lisp-specific +functionality, like setf-able intrinsics for accessing lisp arrays. + +When this module is loaded, it defines an @code{:sse2} feature, +which can be subsequently used for conditional compilation of +code that depends on it. Intrinsic functions are available from +the @code{sse} package. + +This API, with minor technical differences, is supported by both +ECL and SBCL (x86-64 only). + +@menu +* SSE pack types:: +* SSE array type:: +* Differences from C intrinsics:: +* Simple extensions:: +* Lisp array accessors:: +* Example:: +@end menu + +@node SSE pack types +@subsection SSE pack types + +The package defines and/or exports the following types to +represent 128-bit SSE register contents: + +@anchor{Type sse:sse-pack} +@deftp {Type} @somepkg{sse-pack,sse} @&optional item-type +The generic SSE pack type. +@end deftp + +@anchor{Type sse:int-sse-pack} +@deftp {Type} @somepkg{int-sse-pack,sse} +Same as @code{(sse-pack integer)}. +@end deftp + +@anchor{Type sse:float-sse-pack} +@deftp {Type} @somepkg{float-sse-pack,sse} +Same as @code{(sse-pack single-float)}. +@end deftp + +@anchor{Type sse:double-sse-pack} +@deftp {Type} @somepkg{double-sse-pack,sse} +Same as @code{(sse-pack double-float)}. +@end deftp + +Declaring variable types using the subtype appropriate +for your data is likely to lead to more efficient code +(especially on ECL). However, the compiler implicitly +casts between any subtypes of sse-pack when needed. + +Printed representation of SSE packs can be controlled +by binding @code{*sse-pack-print-mode*}: + +@anchor{Variable sse:*sse-pack-print-mode*} +@defvr {Variable} @somepkg{@earmuffs{sse-pack-print-mode},sse} +When set to one of @code{:int}, @code{:float} or +@code{:double}, specifies the way SSE packs are +printed. A @code{NIL} value (default) instructs +the implementation to make its best effort to +guess from the data and context. +@end defvr + +@node SSE array type +@subsection SSE array type + +@anchor{Type sse:sse-array} +@deftp {Type} @somepkg{sse-array,sse} element-type @&optional dimensions +Expands to a lisp array type that is efficiently +supported by AREF-like accessors. +It should be assumed to be a subtype of @code{SIMPLE-ARRAY}. +The type expander signals warnings or errors if it detects +that the element-type argument value is inappropriate or unsafe. +@end deftp + +@anchor{Function sse:make-sse-array} +@deffn {Function} @somepkg{make-sse-array,sse} dimensions @&key element-type initial-element displaced-to displaced-index-offset +Creates an object of type @code{sse-array}, or signals an error. +In non-displaced case ensures alignment of the beginning of data to +the 16-byte boundary. +Unlike @code{make-array}, the element type defaults to (unsigned-byte 8). +@end deffn + +On ECL this function supports full-featured displacement. +On SBCL it has to simulate it by sharing the underlying +data vector, and does not support nonzero index offset. + +@node Differences from C intrinsics +@subsection Differences from C intrinsics + +Intel Compiler, GCC and +@url{http://msdn.microsoft.com/en-us/library/y0dh78ez%28VS.80%29.aspx,MSVC} +all support the same set +of SSE intrinsics, originally designed by Intel. This +package generally follows the naming scheme of the C +version, with the following exceptions: + +@itemize +@item +Underscores are replaced with dashes, and the @code{_mm_} +prefix is removed in favor of packages. + +@item +The 'e' from @code{epi} is dropped because MMX is obsolete +and won't be supported. + +@item +@code{_si128} functions are renamed to @code{-pi} for uniformity +and brevity. The author has personally found this discrepancy +in the original C intrinsics naming highly jarring. + +@item +Comparisons are named using graphic characters, e.g. @code{<=-ps} +for @code{cmpleps}, or @code{/>-ps} for @code{cmpngtps}. In some +places the set of comparison functions is extended to cover the +full possible range. + +@item +Conversion functions are renamed to @code{convert-*-to-*} and +@code{truncate-*-to-*}. + +@item +A few functions are completely renamed: @code{cpu-mxcsr} (setf-able), +@code{cpu-pause}, @code{cpu-load-fence}, @code{cpu-store-fence}, +@code{cpu-memory-fence}, @code{cpu-clflush}, @code{cpu-prefetch-*}. +@end itemize + +In addition, foreign pointer access intrinsics have an additional +optional integer offset parameter to allow more efficient coding +of pointer deference, and the most common ones have been renamed +and made SETF-able: + +@itemize +@item +@code{mem-ref-ss}, @code{mem-ref-ps}, @code{mem-ref-aps} + +@item +@code{mem-ref-sd}, @code{mem-ref-pd}, @code{mem-ref-apd} + +@item +@code{mem-ref-pi}, @code{mem-ref-api}, @code{mem-ref-si64} +@end itemize + +(The @code{-ap*} version requires alignment.) + +@node Simple extensions +@subsection Simple extensions + +This module extends the set of basic intrinsics with the following +simple compound functions: + +@itemize +@item +@code{neg-ss}, @code{neg-ps}, @code{neg-sd}, @code{neg-pd}, +@code{neg-pi8}, @code{neg-pi16}, @code{neg-pi32}, @code{neg-pi64}: + +implement numeric negation of the corresponding data type. + +@item +@code{not-ps}, @code{not-pd}, @code{not-pi}: + +implement bitwise logical inversion. + +@item +@code{if-ps}, @code{if-pd}, @code{if-pi}: + +perform element-wise combining of two values based on a boolean +condition vector produced as a combination of comparison function +results through bitwise logical functions. + +The condition value must use all-zero bitmask for false, and +all-one bitmask for true as a value for each logical vector +element. The result is undefined if any other bit pattern is used. + +N.B.: these are @emph{functions}, so both branches of the +conditional are always evaluated. +@end itemize + +The module also provides symbol macros that expand into expressions +producing certain constants in the most efficient way: + +@itemize +@item +0.0-ps 0.0-pd 0-pi for zero + +@item +true-ps true-pd true-pi for all 1 bitmask + +@item +false-ps false-pd false-pi for all 0 bitmask (same as zero) +@end itemize + +@node Lisp array accessors +@subsection Lisp array accessors + +In order to provide better integration with ordinary lisp code, +this module implements a set of AREF-like memory accessors: + +@itemize +@item +@code{(ROW-MAJOR-)?AREF-PREFETCH-(T0|T1|T2|NTA)} for cache prefetch. + +@item +@code{(ROW-MAJOR-)?AREF-CLFLUSH} for cache flush. + +@item +@code{(ROW-MAJOR-)?AREF-[AS]?P[SDI]} for whole-pack read & write. +@end itemize + +(Where A = aligned; S = aligned streamed write.) + +These accessors can be used with any non-bit specialized +array or vector, without restriction on the precise element +type (although it should be declared at compile time to +ensure generation of the fastest code). + +Additional index bound checking is done to ensure that 16 +bytes of memory are accessible after the specified index. + +As an exception, ROW-MAJOR-AREF-PREFETCH-* does not do any +range checks at all, because the prefetch instructions +are officially safe to use with bad addresses. The +AREF-PREFETCH-* and *-CLFLUSH functions do only ordinary +index checks without the 16-byte extension. + +@node Example +@subsection Example + +This code processes several single-float arrays, storing +either the value of a*b, or c/3.5 into result, depending +on the sign of mode: + +@example +(loop for i from 0 below 128 by 4 + do (setf (aref-ps result i) + (if-ps (<-ps (aref-ps mode i) 0.0-ps) + (mul-ps (aref-ps a i) (aref-ps b i)) + (div-ps (aref-ps c i) (set1-ps 3.5))))) +@end example + +As already noted above, both branches of the if are always +evaluated. diff --git a/contrib/cl-simd/ecl-sse-core.lisp b/contrib/cl-simd/ecl-sse-core.lisp new file mode 100644 index 000000000..71e5006f2 --- /dev/null +++ b/contrib/cl-simd/ecl-sse-core.lisp @@ -0,0 +1,310 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file defines macros for wrapping C-level SSE intrinsics. +;;; + +(in-package #:SSE) + +;;; The compound SSE pack type + +(deftype sse-pack (&optional item) + (ecase item + (* 'ext:sse-pack) + ((single-float float) 'float-sse-pack) + (double-float 'double-sse-pack) + (integer 'int-sse-pack))) + +;;; Helper macros and functions + +(defmacro typename-case (value &body clauses) + "Syntax: (case value &body clauses)" + `(cond ,@(mapcar (lambda (clause) + `((subtypep ,value ',(first clause)) + ,@(rest clause))) + clauses) + (t (error "Unsupported type name: ~S" ,value)))) + +(defun foreign-type-of (lt) + (typename-case lt + (nil :object) + (int-sse-pack :int-sse-pack) + (float-sse-pack :float-sse-pack) + (double-sse-pack :double-sse-pack) + (single-float :float) + (double-float :double) + (fixnum :fixnum) + #+uint32-t + (ext:integer32 :int32-t) + #+uint32-t + (ext:byte32 :uint32-t) + #+uint64-t + (ext:integer64 :int64-t) + #+uint64-t + (ext:byte64 :uint64-t) + (integer :fixnum))) + +(defun pointer-c-type-of (lt) + (typename-case lt + (nil "void") + (int-sse-pack "__m128i") + (float-sse-pack "float") + (double-sse-pack "double"))) + +;; Accept any real values for floating-point arguments: +(defun declaim-arg-type-of (lt) + (typename-case lt + ((or single-float double-float) 'real) + (ext:sse-pack 'ext:sse-pack) + (fixnum 'fixnum) + (t lt))) + +(defun inline-arg-type-of (lt) + (typename-case lt + ((or single-float double-float) 'c::fixnum-float) + (fixnum 'fixnum) + (t lt))) + +;; Constant expansion +(defun expand-constant (form env &optional chgp) + (let* ((mform (macroexpand form env)) + (cform (cond ((and (symbolp mform) (constantp mform)) + (symbol-value mform)) + (t mform)))) + (values cform (or chgp (not (eql cform form)))))) + +;; Macro helpers +(defun make-arg-name (index) + (intern (format nil "ARG~A" index))) + +(defun make-arg-nums (lst) + (loop for i from 0 below (length lst) collect i)) + +(defun wrap-ret-arg (core ret-type &optional ret-arg) + (cond ((eq ret-type nil) + (format nil "(~A,Cnil)" core)) + (ret-arg + (format nil "@~36R;(~A,#~36R)" ret-arg core ret-arg)) + (t core))) + +;; Constant generation +(defun make-pack-of-bin (bin-value &key (as 'int-sse-pack)) + (let* ((all (loop for i from 0 to 15 + for v = bin-value then (ash v -8) + collect (logand v 255))) + (pack (ext:vector-to-sse-pack + (make-array 16 :element-type '(unsigned-byte 8) :initial-contents all)))) + (if (eq as 'int-sse-pack) + pack + `(the ,as ,(ext:sse-pack-as-elt-type + pack (ecase as + (EXT:FLOAT-SSE-PACK 'single-float) + (EXT:DOUBLE-SSE-PACK 'double-float))))))) + +(defmacro def-inline (name mode arg-types ret-type call-str &rest flags) + `(eval-when (:compile-toplevel :load-toplevel) + (c::def-inline ',name ',mode ',arg-types ',ret-type ,call-str ,@flags))) + +(defmacro def-intrinsic (name arg-types ret-type c-name + &key (export t) ret-arg reorder-args immediate-args) + "Defines and exports an SSE intrinsic function with matching open-coding rules." + (let* ((anums (make-arg-nums arg-types)) + (asyms (mapcar #'make-arg-name anums)) + (aftypes (mapcar #'foreign-type-of arg-types)) + (rftype (foreign-type-of ret-type)) + (call-anums (if reorder-args (reverse anums) anums)) + (call-str (wrap-ret-arg (format nil "~A(~{#~36R~^,~})" c-name call-anums) ret-type ret-arg))) + `(progn + ,(if export `(export ',name)) + ,@(if immediate-args ; Generate a constantness verifier macro + `((define-compiler-macro ,name (&whole whole &environment env ,@asyms &aux chgp) + ,@(loop for (arg type) in immediate-args + collect `(let ((oldv ,arg)) + (multiple-value-setq (,arg chgp) (expand-constant oldv env chgp)) + (unless (typep ,arg ',type) + (c::cmperr "In call to ~A: Argument ~S~@[ = ~S~] is not a constant of type ~A" + ',name oldv (unless (eql oldv ,arg) ,arg) ',type)))) + (if chgp (list ',name ,@asyms) whole)))) + (proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,(or ret-type 'null)) ,name)) + ,@(if (null immediate-args) + `((defun ,name ,asyms + (declare (optimize (speed 0) (debug 0) (safety 1))) + (ffi:c-inline ,asyms ,aftypes ,rftype ,call-str :one-liner t)))) + (def-inline ,name :always ,(mapcar #'inline-arg-type-of arg-types) ,rftype + ,call-str :inline-or-warn t)))) + +(defmacro def-unary-intrinsic (name ret-type insn cost c-name + &key (arg-type ret-type) partial result-size immediate-arg) + (declare (ignore insn cost partial result-size)) + `(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg))) + ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg))))) + +(defmacro def-cvt-to-int32-intrinsic (name ret-type insn cost c-name + &key (arg-type ret-type) partial immediate-arg) + (declare (ignore insn cost partial)) + (assert (subtypep ret-type '(signed-byte 32))) + `(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg))) + ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg))))) + +(defmacro def-binary-intrinsic (name ret-type insn cost c-name + &key (x-type ret-type) (y-type ret-type) + commutative tags immediate-arg) + (declare (ignore insn cost commutative tags)) + `(def-intrinsic ,name (,x-type ,y-type ,@(if immediate-arg (list immediate-arg))) + ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg))))) + +(defmacro def-sse-int-intrinsic (name int-type ret-type insn cost c-name + &key (arg-type ret-type) immediate-arg make-temporary) + (declare (ignore insn cost make-temporary)) + `(def-intrinsic ,name (,arg-type ,int-type ,@(if immediate-arg (list immediate-arg))) + ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg))))) + +(defmacro %def-aref-intrinsic (tag val-type c-type reader writer &key (aux-args "") (bsize 16)) + "Defines and exports macros and functios that implement vectorized array access." + (let* ((rftype (foreign-type-of val-type)) + (aref-name (intern (format nil "AREF-~A" tag) *package*)) + (rm-aref-name (intern (format nil "ROW-MAJOR-AREF-~A" tag) *package*)) + (rm-aset-name (intern (format nil "ROW-MAJOR-ASET-~A" tag) *package*)) + (known-elt-types '((single-float "sf") + (double-float "df") + (ext:byte8 "b8") + (ext:integer8 "i8") + #+uint16-t (ext:byte16 "b16") + #+uint16-t (ext:integer16 "i16") + #+uint32-t (ext:byte32 "b32") + #+uint32-t (ext:integer32 "i32") + #+uint64-t (ext:byte64 "b64") + #+uint64-t (ext:integer64 "i64")))) + (flet ((fmtr (ptr-fmt &rest ptr-args) + (wrap-ret-arg (format nil "~A((~A*)~?~A)" + reader c-type ptr-fmt ptr-args aux-args) + val-type)) + (fmtw (ptr-fmt &rest ptr-args) + (wrap-ret-arg (format nil "~A((~A*)~?,#2)" + writer c-type ptr-fmt ptr-args) + val-type 2))) + `(progn + (export ',aref-name) + (export ',rm-aref-name) + (defmacro ,aref-name (array &rest indexes) + (let ((varr (gensym "ARR"))) + `(let ((,varr ,array)) + (declare (:read-only ,varr)) + (,',rm-aref-name ,varr (array-row-major-index ,varr ,@indexes))))) + (proclaim '(ftype (function (array fixnum) ,(or val-type 'null)) ,rm-aref-name)) + (defun ,rm-aref-name (array index) + (declare (optimize (speed 0) (debug 0) (safety 2))) + (ffi:c-inline (array index) (:object :int) ,rftype + ,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize) + :one-liner t)) + ;; AREF + (def-inline ,rm-aref-name :always (t t) ,rftype + ,(fmtr "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize) + :inline-or-warn t) + (def-inline ,rm-aref-name :always (t fixnum) ,rftype + ,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize)) + ;; AREF unsafe + ,@(mapcar (lambda (spec) + `(def-inline ,rm-aref-name :unsafe ((array ,(first spec)) fixnum) ,rftype + ,(fmtr "(&(#0)->array.self.~A[#1])" (second spec)))) + known-elt-types) + ,@(if writer + `((define-setf-expander ,aref-name (array &rest indexes) + (let ((varr (gensym)) (vidx (gensym)) (vval (gensym))) + (values (list varr vidx) + (list array `(array-row-major-index ,varr ,@indexes)) + (list vval) + `(,',rm-aset-name ,varr ,vidx ,vval) `(,',rm-aref-name ,varr ,vidx)))) + (proclaim '(ftype (function (array fixnum ,(declaim-arg-type-of val-type)) ,val-type) ,rm-aset-name)) + (defun ,rm-aset-name (array index value) + (declare (optimize (speed 0) (debug 0) (safety 2))) + (prog1 value + (ffi:c-inline (array index value) (:object :int ,rftype) :void + ,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize) + :one-liner t))) + (defsetf ,rm-aref-name ,rm-aset-name) + ;; ASET + (def-inline ,rm-aset-name :always (t t ,val-type) ,rftype + ,(fmtw "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize) + :inline-or-warn t) + (def-inline ,rm-aset-name :always (t fixnum ,val-type) ,rftype + ,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize)) + ;; ASET unsafe + ,@(mapcar (lambda (spec) + `(def-inline ,rm-aset-name :unsafe ((array ,(first spec)) fixnum ,val-type) ,rftype + ,(fmtw "(&(#0)->array.self.~A[#1])" (second spec)))) + known-elt-types))))))) + +(defmacro def-aref-intrinsic (tag val-type reader-fun writer-fun &key (check-bounds t)) + `(%def-aref-intrinsic ,tag ,val-type ,(pointer-c-type-of val-type) + ,(get reader-fun 'c-function-name) ,(get writer-fun 'c-function-name) + :bsize ,(ecase check-bounds + (t 16) ((nil) 0) (:no-gap 1)) + :aux-args ,(get reader-fun 'c-call-aux-args))) + +(defmacro def-mem-intrinsic (name c-type ret-type c-name &key (public t) + prefix-args (prefix-fmt "~@{#~36R,~}") + postfix-args (postfix-fmt "~@{,#~36R~}" pf-p) ret-arg) + "Defines and exports an SSE memory access intrinsic function with matching open-coding rules." + (let* ((anums (make-arg-nums (append prefix-args postfix-args))) + (asyms (mapcar #'make-arg-name anums)) + (prefix-nums (subseq anums 0 (length prefix-args))) + (postfix-nums (mapcar #'1+ (subseq anums (length prefix-args)))) + (prefix-syms (subseq asyms 0 (length prefix-args))) + (postfix-syms (subseq asyms (length prefix-args))) + (prefix-itypes (mapcar #'inline-arg-type-of prefix-args)) + (postfix-itypes (mapcar #'inline-arg-type-of postfix-args)) + (rftype (foreign-type-of ret-type)) + (ptr-idx (length prefix-args)) + (offset-idx (+ ptr-idx 1 (length postfix-args)))) + (flet ((fmt (ptr-text) + (wrap-ret-arg (format nil "~A(~?(~A*)~?~?)" + c-name prefix-fmt prefix-nums + c-type ptr-text (list ptr-idx offset-idx) + postfix-fmt postfix-nums) + ret-type ret-arg))) + `(progn + ,(when public `(export ',name)) + (eval-when (:compile-toplevel :load-toplevel :execute) + (setf (get ',name 'c-function-name) ,c-name) + ,(if (and pf-p (null postfix-args)) + `(setf (get ',name 'c-call-aux-args) ,postfix-fmt))) + (proclaim '(ftype (function (,@(mapcar #'declaim-arg-type-of prefix-args) si:foreign-data + ,@(mapcar #'declaim-arg-type-of postfix-args) &optional fixnum) ,ret-type) ,name)) + (defun ,name (,@prefix-syms ptr ,@postfix-syms &optional (offset 0)) + (declare (optimize (speed 0) (debug 0) (safety 1))) + (ffi:c-inline (,@prefix-syms ptr ,@postfix-syms offset) + (,@(mapcar #'foreign-type-of prefix-args) :pointer-void + ,@(mapcar #'foreign-type-of postfix-args) :int) ,rftype + ,(fmt "(((char*)#~A) + #~A)") + :one-liner t)) + (def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes) ,rftype + ,(fmt "ecl_to_pointer(#~A)") + :inline-or-warn t) + (def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes t) ,rftype + ,(fmt "(((char*)ecl_to_pointer(#~A)) + fixint(#~A))")) + (def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes fixnum) ,rftype + ,(fmt "(((char*)ecl_to_pointer(#~A)) + #~A)")) + (def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes) ,rftype + ,(fmt "(#~A)->foreign.data")) + (def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes t) ,rftype + ,(fmt "(((char*)(#~A)->foreign.data) + fix(#~A))")) + (def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes fixnum) ,rftype + ,(fmt "(((char*)(#~A)->foreign.data) + #~A)")))))) + +(defmacro def-load-intrinsic (name ret-type insn c-name &key register-arg tags size postfix-fmt) + (declare (ignore insn tags size)) + `(def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name + :prefix-args ,(if register-arg (list ret-type)) + :postfix-fmt ,(or postfix-fmt ""))) + +(defmacro def-store-intrinsic (name ret-type insn c-name &key setf-name) + (declare (ignore insn)) + `(progn + (def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name + :public ,(not setf-name) :postfix-args (,ret-type) :ret-arg 1) + ,(if setf-name + `(defsetf ,setf-name (pointer &optional (offset 0)) (value) + `(,',name ,pointer ,value ,offset))))) + diff --git a/contrib/cl-simd/ecl-sse-utils.lisp b/contrib/cl-simd/ecl-sse-utils.lisp new file mode 100644 index 000000000..1bbea7770 --- /dev/null +++ b/contrib/cl-simd/ecl-sse-utils.lisp @@ -0,0 +1,398 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file defines some extensions to the base intrinsic set, +;;; and other utility functions. +;;; + +(in-package #:SSE) + +;;; Helper macros and functions +(eval-when (:compile-toplevel :load-toplevel :execute) + ;; Try using a matching inverse function name + (defun lookup-flip (arg pairs &key no-reverse) + (and (consp arg) + (let ((fix (or (cdr (assoc (first arg) pairs)) + (unless no-reverse + (car (rassoc (first arg) pairs)))))) + (cond ((eq fix :identity) + (assert (null (cddr arg))) + (second arg)) + (fix + `(,fix ,@(rest arg))) + (t nil))))) + ;; Macroexpand, plus compiler expand some specific names + (defun expand-condition (form env) + (setq form (macroexpand form env)) + (loop while (and (consp form) + (symbolp (first form)) + (get (first form) 'expand-in-condition)) + do (setq form (c::cmp-expand-macro (compiler-macro-function (first form)) + form env))) + form) + ;; Checks if the form is an unary call + (defun is-unary? (form op) + (and (consp form) + (eq (first form) op) + (null (cddr form)))) + ;; IF-style function expander + (defun expand-if-macro (condition then-value else-value env if-f not-f or-f and-f andnot-f type-name zero-val &key flip) + (let* ((condition (expand-condition condition env)) + (then-value (macroexpand then-value env)) + (else-value (macroexpand else-value env)) + (then-zero? (equal then-value zero-val)) + (else-zero? (equal else-value zero-val))) + (cond ((is-unary? condition not-f) + (expand-if-macro (second condition) else-value then-value + env if-f not-f or-f and-f andnot-f type-name zero-val + :flip (not flip))) + ((and then-zero? else-zero?) + zero-val) + (then-zero? + `(,andnot-f ,condition ,else-value)) + (else-zero? + `(,and-f ,condition ,then-value)) + (t + (let* ((csym (gensym)) + (args `((,and-f ,csym ,then-value) + (,andnot-f ,csym ,else-value)))) + `(let ((,csym ,condition)) + (declare (type ,type-name ,csym) + (:read-only ,csym)) + (,or-f ,@(if flip (reverse args) args))))))))) + +(defmacro def-utility (name arg-types ret-type expansion &key expand-args expand-in-condition) + "Defines and exports a function & compiler macro with the specified expansion." + (let* ((anames (mapcar #'make-arg-name (make-arg-nums arg-types)))) + `(progn + (export ',name) + (eval-when (:compile-toplevel :load-toplevel) + ,@(if expand-in-condition + `((setf (get ',name 'expand-in-condition) t))) + (define-compiler-macro ,name (&environment env ,@anames) + (declare (ignorable env)) + ,@(loop for arg in (if (eq expand-args t) anames expand-args) + collect `(setq ,arg (macroexpand ,arg env))) + ,expansion)) + (proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,ret-type) ,name)) + (defun ,name ,anames + (declare (optimize (speed 0) (debug 0) (safety 1))) + (let ,(mapcar #'list anames anames) + (declare ,@(loop for an in anames and at in arg-types + collect `(type ,at ,an))) + ;; Depends on the compiler macro being expanded: + (,name ,@anames)))))) + +(defmacro def-if-function (name type-name postfix) + `(def-utility ,name (,type-name ,type-name ,type-name) ,type-name + (expand-if-macro arg0 arg1 arg2 env + ',name + ',(intern (format nil "NOT-~A" postfix)) + ',(intern (format nil "OR-~A" postfix)) + ',(intern (format nil "AND-~A" postfix)) + ',(intern (format nil "ANDNOT-~A" postfix)) + ',type-name + '(,(intern (format nil "SETZERO-~A" postfix)))))) + +;;; Aligned array allocation + +(deftype sse-array (elt-type &optional dims) + "Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array. +Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed." + (when (eq elt-type '*) + (c::cmperr "SSE-ARRAY must have a specific element type.")) + (let ((upgraded (upgraded-array-element-type elt-type))) + (when (member upgraded '(t bit)) + (c::cmperr "Invalid SSE-ARRAY element type: ~S" elt-type)) + (unless (subtypep upgraded elt-type) + (c::cmpwarn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded)) + `(array ,upgraded ,dims))) + +(defun make-sse-array (dimensions &rest args &key (element-type '(unsigned-byte 8)) displaced-to &allow-other-keys) + "Allocates an SSE-ARRAY aligned to the 16-byte boundary. May flatten displacement chains for performance reasons." + (if displaced-to + (apply #'make-array dimensions args) + (multiple-value-bind (elt-size adj-type) + (array-element-type-byte-size element-type) + (when (eq adj-type t) + (error "Cannot use element type T with SSE.")) + (sys::remf args :element-type) + (let* ((full-size (if (numberp dimensions) + dimensions + (reduce #'* dimensions))) + (padded-size (+ full-size (ceiling 15 elt-size))) + (array (apply #'make-array padded-size :element-type adj-type args)) + (misalign (ffi:c-inline (array) (:object) :int + "(((unsigned long)(#0)->array.self.b8) & 15)" + :one-liner t)) + (offset (/ (if (> misalign 0) (- 16 misalign) 0) elt-size))) + (make-array dimensions :element-type element-type + :displaced-to array :displaced-index-offset offset))))) + +;;; Single-float tools + +;; Constants + +(defmacro set-true-ss () + (load-time-value (make-pack-of-bin #xFFFFFFFF :as 'float-sse-pack))) + +(defmacro set-true-ps () + (load-time-value (make-pack-of-bin -1 :as 'float-sse-pack))) + +(eval-when (:compile-toplevel :load-toplevel) + (define-symbol-macro 0.0-ps (setzero-ps)) + + (define-symbol-macro true-ss (set-true-ss)) + (define-symbol-macro false-ss (setzero-ps)) + + (define-symbol-macro true-ps (set-true-ps)) + (define-symbol-macro false-ps (setzero-ps))) + +;; Bitwise if + +(def-if-function if-ps float-sse-pack #:ps) + +;; Arithmetic negation (xor with negative zero) + +(def-utility neg-ss (float-sse-pack) float-sse-pack + `(xor-ps ,arg0 ,(load-time-value (make-pack-of-bin #x80000000 :as 'float-sse-pack)))) + +(def-utility neg-ps (float-sse-pack) float-sse-pack + `(xor-ps ,arg0 ,(load-time-value + (make-pack-of-bin #x80000000800000008000000080000000 :as 'float-sse-pack)))) + +;; Logical inversion + +(def-utility not-ps (float-sse-pack) float-sse-pack + (or (lookup-flip arg0 '((=-ps . /=-ps) + (<-ps . /<-ps) + (<=-ps . /<=-ps) + (>-ps . />-ps) + (>=-ps . />=-ps) + (cmpord-ps . cmpunord-ps) + (not-ps . :identity))) + `(xor-ps ,arg0 true-ps)) + :expand-args t) + +;; Shuffle + +(defun shuffle-ps (x y mask) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x y mask)) + (check-type x sse-pack) + (check-type y sse-pack) + (check-type mask (unsigned-byte 8)) + (ffi:c-inline (x y mask) (:object :object :int) :float-sse-pack + "_mm_setr_ps( + (#0)->sse.data.sf[(#2)&3], + (#0)->sse.data.sf[((#2)>>2)&3], + (#1)->sse.data.sf[((#2)>>4)&3], + (#1)->sse.data.sf[((#2)>>6)&3] + )" :one-liner t)) + +;;; Double-float tools + +;; Constants + +(defmacro set-true-sd () + (load-time-value (make-pack-of-bin #xFFFFFFFFFFFFFFFF :as 'double-sse-pack))) + +(defmacro set-true-pd () + (load-time-value (make-pack-of-bin -1 :as 'double-sse-pack))) + +(eval-when (:compile-toplevel :load-toplevel) + (define-symbol-macro 0.0-pd (setzero-pd)) + + (define-symbol-macro true-sd (set-true-sd)) + (define-symbol-macro false-sd (setzero-pd)) + + (define-symbol-macro true-pd (set-true-pd)) + (define-symbol-macro false-pd (setzero-pd))) + +;; Bitwise if + +(def-if-function if-pd double-sse-pack #:pd) + +;; Arithmetic negation (xor with negative zero) + +(def-utility neg-sd (double-sse-pack) double-sse-pack + `(xor-pd ,arg0 + ,(load-time-value + (make-pack-of-bin #x8000000000000000 :as 'double-sse-pack)))) + +(def-utility neg-pd (double-sse-pack) double-sse-pack + `(xor-pd ,arg0 + ,(load-time-value + (make-pack-of-bin #x80000000000000008000000000000000 :as 'double-sse-pack)))) + +;; Logical inversion + +(def-utility not-pd (double-sse-pack) double-sse-pack + (or (lookup-flip arg0 '((=-pd . /=-pd) + (<-pd . /<-pd) + (<=-pd . /<=-pd) + (>-pd . />-pd) + (>=-pd . />=-pd) + (cmpord-pd . cmpunord-pd) + (not-pd . :identity))) + `(xor-pd ,arg0 true-pd)) + :expand-args t) + +;; Shuffle + +(defun shuffle-pd (x y mask) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x y mask)) + (check-type x sse-pack) + (check-type y sse-pack) + (check-type mask (unsigned-byte 2)) + (ffi:c-inline (x y mask) (:object :object :int) :double-sse-pack + "_mm_setr_pd( + (#0)->sse.data.df[(#2)&1], + (#1)->sse.data.df[((#2)>>1)&1] + )" :one-liner t)) + +;;; Integer tools + +;; Constants + +(defmacro set-true-pi () + (load-time-value (make-pack-of-bin -1 :as 'int-sse-pack))) + +(eval-when (:compile-toplevel :load-toplevel) + (define-symbol-macro 0-pi (setzero-pi)) + + (define-symbol-macro true-pi (set-true-pi)) + (define-symbol-macro false-pi (setzero-pi))) + +;; Bitwise if + +(def-if-function if-pi float-sse-pack #:pi) + +;; Arithmetic negation (subtract from 0) + +(macrolet ((frob (name subf) + `(def-utility ,name (int-sse-pack) int-sse-pack + `(,',subf (setzero-pi) ,arg0)))) + (frob neg-pi8 sub-pi8) + (frob neg-pi16 sub-pi16) + (frob neg-pi32 sub-pi32) + (frob neg-pi64 sub-pi64)) + +;; Logical inversion + +(def-utility not-pi (int-sse-pack) int-sse-pack + (or (lookup-flip arg0 '((<=-pi8 . >-pi8) + (<=-pi16 . >-pi16) + (<=-pi32 . >-pi32) + (>=-pi8 . <-pi8) + (>=-pi16 . <-pi16) + (>=-pi32 . <-pi32) + (/=-pi8 . =-pi8) + (/=-pi16 . =-pi16) + (/=-pi32 . =-pi32) + (not-pi . :identity)) + :no-reverse t) + `(xor-pi ,arg0 true-pi)) + :expand-args t) + +(macrolet ((frob (name code) + `(def-utility ,name (int-sse-pack int-sse-pack) int-sse-pack + ,code + :expand-in-condition t))) + + (frob <=-pi8 `(not-pi (>-pi8 ,arg0 ,arg1))) + (frob <=-pi16 `(not-pi (>-pi16 ,arg0 ,arg1))) + (frob <=-pi32 `(not-pi (>-pi32 ,arg0 ,arg1))) + + (frob >=-pi8 `(not-pi (<-pi8 ,arg0 ,arg1))) + (frob >=-pi16 `(not-pi (<-pi16 ,arg0 ,arg1))) + (frob >=-pi32 `(not-pi (<-pi32 ,arg0 ,arg1))) + + (frob /=-pi8 `(not-pi (=-pi8 ,arg0 ,arg1))) + (frob /=-pi16 `(not-pi (=-pi16 ,arg0 ,arg1))) + (frob /=-pi32 `(not-pi (=-pi32 ,arg0 ,arg1)))) + +;; Shifts + +(defun slli-pi (x shift) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x shift)) + (check-type x sse-pack) + (check-type shift (unsigned-byte 8)) + (ffi:c-inline (x shift) (:object :int) :object + "cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128()); + unsigned bshift=(#1), i; + for (i = 0; i + bshift < 16; i++) + rv->sse.data.b8[i+bshift] = (#0)->sse.data.b8[i]; + @(return) = rv;")) + +(defun srli-pi (x shift) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x shift)) + (check-type x sse-pack) + (check-type shift (unsigned-byte 8)) + (ffi:c-inline (x shift) (:object :int) :object + "cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128()); + int bshift=(#1), i; + for (i = 16 - bshift - 1; i >= 0; i--) + rv->sse.data.b8[i] = (#0)->sse.data.b8[i+bshift]; + @(return) = rv;")) + +;; Extract & insert + +(defun extract-pi16 (x index) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x index)) + (check-type x sse-pack) + (check-type index (unsigned-byte 8)) + (ffi:c-inline (x index) (:object :int) :fixnum + "*((unsigned short*)&(#0)->sse.data.b8[((#1)&3)*2])" + :one-liner t)) + +(defun insert-pi16 (x ival index) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x ival index)) + (check-type x sse-pack) + (check-type index (unsigned-byte 8)) + (ffi:c-inline (x ival index) (:int-sse-pack :int :int) :object + "cl_object rv = ecl_make_int_sse_pack(#0); + *((unsigned short*)&rv->sse.data.b8[((#2)&3)*2]) = (unsigned short)(#1); + @(return) = rv;")) + +;; Shuffles + +(defun shuffle-pi32 (x mask) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x mask)) + (check-type x sse-pack) + (check-type mask (unsigned-byte 8)) + (ffi:c-inline (x mask) (:object :int) :int-sse-pack + "unsigned *pd = (unsigned*)(#0)->sse.data.b8; + @(return) = _mm_setr_epi32(pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[((#1)>>6)&3]);")) + +(defun shufflelo-pi16 (x mask) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x mask)) + (check-type x sse-pack) + (check-type mask (unsigned-byte 8)) + (ffi:c-inline (x mask) (:object :int) :int-sse-pack + "unsigned short *pd = (unsigned short*)(#0)->sse.data.b8; + @(return) = _mm_setr_epi16( + pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)], + pd[4], pd[5], pd[6], pd[7] + );")) + +(defun shufflehi-pi16 (x mask) + (declare (optimize (speed 0) (debug 0) (safety 1)) + (type t x mask)) + (check-type x sse-pack) + (check-type mask (unsigned-byte 8)) + (ffi:c-inline (x mask) (:object :int) :int-sse-pack + "unsigned short *pb = (unsigned short*)(#0)->sse.data.b8, *pd = pb+4; + @(return) = _mm_setr_epi16( + pb[0], pb[1], pb[2], pb[3], + pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)] + );")) + diff --git a/contrib/cl-simd/sbcl-arrays.lisp b/contrib/cl-simd/sbcl-arrays.lisp new file mode 100644 index 000000000..a2bff1b36 --- /dev/null +++ b/contrib/cl-simd/sbcl-arrays.lisp @@ -0,0 +1,280 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file contains the groundwork for vectorized +;;; array access intrinsics. +;;; + +(in-package #:SSE) + +;; SSE array element size calculation + +(eval-when (:compile-toplevel :load-toplevel :execute) + (defun sse-elt-shift-from-saetp (info) + (and info + (subtypep (saetp-specifier info) 'number) + (not (saetp-fixnum-p info)) + (case (saetp-n-bits info) + (8 0) (16 1) (32 2) (64 3) (128 4))))) + +(defglobal %%size-shift-table%% + (let ((arr (make-array (1+ widetag-mask) :initial-element nil))) + (loop + for info across *specialized-array-element-type-properties* + for shift = (sse-elt-shift-from-saetp info) + when shift + do (setf (svref arr (saetp-typecode info)) shift)) + arr) + "A table of element size shifts for supported SSE array types.") + +(declaim (inline sse-elt-shift-of) + (ftype (function (t) (integer 0 4)) sse-elt-shift-of)) + +(defun sse-elt-shift-of (obj) + "Returns the SSE element size shift for the given object, +or fails if it is not a valid SSE vector." + (declare (optimize (safety 0))) + (the (integer 0 4) + (or (svref %%size-shift-table%% + (if (sb-vm::%other-pointer-p obj) + (%other-pointer-widetag obj) + 0)) + (error 'type-error + :datum obj + :expected-type 'sse-array)))) + +;;; Type and allocation + +(deftype sse-array (&optional (elt-type '* et-p) dims) + "Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array. +Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed." + (if (eq elt-type '*) + (progn + (when et-p + (error "SSE-ARRAY must have a specific element type.")) + `(simple-array * ,dims)) + (let* ((upgraded (upgraded-array-element-type elt-type)) + (shift (sse-elt-shift-from-saetp (find-saetp upgraded)))) + (when (null shift) + (error "Invalid SSE-ARRAY element type: ~S" elt-type)) + (unless (subtypep upgraded elt-type) + (warn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded)) + `(simple-array ,upgraded ,dims)))) + +(defun make-sse-array (dimensions &key (element-type '(unsigned-byte 8)) (initial-element nil ie-p) displaced-to (displaced-index-offset 0)) + "Allocates an SSE-ARRAY aligned to the 16-byte boundary. Flattens displacement chains for performance reasons." + (let* ((upgraded (upgraded-array-element-type element-type)) + (shift (sse-elt-shift-from-saetp (find-saetp upgraded)))) + (when (null shift) + (error "Invalid SSE-ARRAY element type: ~S" element-type)) + (if displaced-to + ;; Fake displacement by allocating a simple-array header + (let* ((dimensions (if (listp dimensions) dimensions (list dimensions))) + (rank (length dimensions)) + (count (reduce #'* dimensions))) + (unless (subtypep element-type (array-element-type displaced-to)) + (error "can't displace an array of type ~S into another of type ~S" + element-type (array-element-type displaced-to))) + (with-array-data ((data displaced-to) + (start displaced-index-offset) + (end)) + (unless (= start 0) + (error "SSE-ARRAY does not support displaced index offset.")) + (unless (<= count end) + (array-bounding-indices-bad-error data start count)) + (if (= rank 1) + (progn + (when (< count end) + (warn "SSE-ARRAY displaced size extended to the full length of the vector.")) + data) + (let ((new-array (make-array-header simple-array-widetag rank))) + (set-array-header new-array data count nil 0 dimensions nil t))))) + ;; X86-64 vectors are already aligned to 16 bytes + (apply #'make-array dimensions :element-type upgraded + (if ie-p (list :initial-element initial-element)))))) + +;;; AREF intrinsic definition helpers + +(defconstant +vector-data-fixup+ + (- (* vector-data-offset n-word-bytes) other-pointer-lowtag) + "Offset from a tagged vector pointer to its data") + +(defmacro array-data-expr (array-var &optional is-vector) + (ecase is-vector + (:yes array-var) + (:no `(%array-data-vector ,array-var)) + ((nil) + `(if (array-header-p ,array-var) + (%array-data-vector ,array-var) + ,array-var)))) + +;; Depends on the vector-length field being in the same place +;; as the array fill pointer, which for simple-array is equal +;; to the total size. +(defknown %sse-array-size (simple-array fixnum) array-total-size (flushable always-translatable)) + +(define-vop (%sse-array-size/0) + (:translate %sse-array-size) + (:args (array :scs (descriptor-reg))) + (:arg-types * (:constant (integer 0 0))) + (:info gap) + (:ignore gap) + (:policy :fast-safe) + (:results (result :scs (any-reg))) + (:result-types tagged-num) + (:generator 3 + (loadw result array vector-length-slot other-pointer-lowtag))) + +(define-vop (%sse-array-size %sse-array-size/0) + (:arg-types * (:constant (integer 1 16))) + (:ignore) + (:temporary (:sc any-reg) tmp) + (:generator 8 + (loadw result array vector-length-slot other-pointer-lowtag) + (inst mov tmp (fixnumize gap)) + (inst cmp result tmp) + (inst cmov :ng tmp result) + (inst sub result tmp))) + +(defmacro with-sse-data (((sap-var data-var array) (offset-var index)) &body code) + ;; Compute a SAP and offset for the specified array and index. Check bounds. + (with-unique-names (data-index data-end elt-shift access-size) + (once-only ((array array) + (index index)) + `(locally + (declare (optimize (insert-array-bounds-checks 0))) + (with-array-data ((,data-var ,array) + (,data-index ,index) + (,data-end)) + (let* ((,sap-var (int-sap (get-lisp-obj-address ,data-var))) + (,elt-shift (sse-elt-shift-of ,data-var)) + (,access-size (ash 16 (- ,elt-shift))) + (,offset-var (+ (ash ,data-index ,elt-shift) +vector-data-fixup+))) + (declare (type system-area-pointer ,sap-var) + (type fixnum ,offset-var)) + (unless (<= 0 ,data-index (+ ,data-index ,access-size) ,data-end) + (array-bounding-indices-bad-error ,array ,index (+ ,index ,access-size))) + ,@code)))))) + +(defun sse-array-info-or-give-up (lvar) + ;; Look up the SSE element size and check if it is definitely a vector + (let ((type (lvar-type lvar))) + (unless (and (array-type-p type) + (not (array-type-complexp type))) + (give-up-ir1-transform "not a simple array")) + (let* ((etype (array-type-specialized-element-type type)) + (shift (sse-elt-shift-from-saetp + (if (eq etype *wild-type*) nil + (find-saetp-by-ctype etype))))) + (unless shift + (give-up-ir1-transform "not a known SSE-compatible array element type: ~S" + (type-specifier etype))) + (values (ash 1 shift) ; step + (1- (ash 16 (- shift))) ; gap + (and (listp (array-type-dimensions type)) + (if (null (cdr (array-type-dimensions type))) :yes :no)))))) + +(defmacro def-aref-intrinsic (postfix rtype reader writer &key (check-bounds t)) + (let* ((rm-aref (symbolicate "ROW-MAJOR-AREF-" postfix)) + (rm-aset (if writer (symbolicate "ROW-MAJOR-ASET-" postfix))) + (aref (symbolicate "AREF-" postfix)) + (aset (if writer (symbolicate "%ASET-" postfix))) + (reader-vop (symbolicate "%" reader)) + (reader/ix-vop (symbolicate "%" reader "/IX")) + (writer-vop (if writer (symbolicate "%" writer))) + (writer/ix-vop (if writer (symbolicate "%" writer "/IX"))) + (rtype (or rtype '(values))) + (index-expression + (ecase check-bounds + (t ``(the index (%check-bound array (%sse-array-size array ,gap) index))) + ((nil) ``(the index index)) + (:no-gap ``(the index (%check-bound array (%sse-array-size array 0) index)))))) + `(progn + ;; ROW-MAJOR-AREF + (export ',rm-aref) + (defknown ,rm-aref (array index) ,rtype (foldable flushable)) + (defun ,rm-aref (array index) + (with-sse-data ((sap data array) + (offset index)) + (,reader-vop sap offset))) + ;; + (deftransform ,rm-aref ((array index) (simple-array t) * :important t) + ,(format nil "open-code ~A" rm-aref) + (multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array) + (declare (ignorable gap)) + `(,',reader/ix-vop (array-data-expr array ,is-vector) + ,,index-expression + ,step ,+vector-data-fixup+))) + ;; AREF + (export ',aref) + (defknown ,aref (array &rest index) ,rtype (foldable flushable)) + (defun ,aref (array &rest indices) + (declare (truly-dynamic-extent indices)) + (with-sse-data ((sap data array) + (offset (%array-row-major-index array indices))) + (,reader-vop sap offset))) + ;; + (defoptimizer (,aref derive-type) ((array &rest indices) node) + (assert-array-rank array (length indices)) + (values-specifier-type ',rtype)) + (deftransform ,aref ((array &rest indices) (simple-array &rest t) * :important t) + ,(format nil "open-code ~A" aref) + (multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array) + (declare (ignorable gap)) + (let ((syms (make-gensym-list (length indices)))) + `(lambda (array ,@syms) + (let ((index ,(if (eq is-vector :yes) (first syms) + `(array-row-major-index array ,@syms)))) + (,',reader/ix-vop (array-data-expr array ,is-vector) + ,,index-expression + ,step ,+vector-data-fixup+)))))) + ,@(if writer + `(;; ROW-MAJOR-ASET + (defknown ,rm-aset (array index sse-pack) ,rtype (unsafe)) + (defsetf ,rm-aref ,rm-aset) + (defun ,rm-aset (array index new-value) + (with-sse-data ((sap data array) + (offset index)) + (,writer-vop sap offset (the ,rtype new-value)) + new-value)) + ;; + (deftransform ,rm-aset ((array index value) (simple-array t t) * :important t) + ,(format nil "open-code ~A" rm-aset) + (multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array) + (declare (ignorable gap)) + `(progn + (,',writer/ix-vop (array-data-expr array ,is-vector) + ,,index-expression + ,step ,+vector-data-fixup+ + (the sse-pack value)) + value))) + ;; %ASET + (defknown ,aset (array &rest t) ,rtype (unsafe)) + (defsetf ,aref ,aset) + (defun ,aset (array &rest stuff) + (let ((new-value (car (last stuff)))) + (with-sse-data ((sap data array) + (offset (%array-row-major-index array (nbutlast stuff)))) + (,writer-vop sap offset (the ,rtype new-value)) + new-value))) + ;; + (defoptimizer (,aset derive-type) ((array &rest stuff) node) + (assert-array-rank array (1- (length stuff))) + (assert-lvar-type (car (last stuff)) (specifier-type 'sse-pack) + (lexenv-policy (node-lexenv node))) + (specifier-type ',rtype)) + (deftransform ,aset ((array &rest stuff) (simple-array &rest t) * :important t) + ,(format nil "open-code ~A" aset) + (multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array) + (declare (ignorable gap)) + (let ((syms (make-gensym-list (length stuff)))) + `(lambda (array ,@syms) + (let ((index (the index ,(if (eq is-vector :yes) (first syms) + `(array-row-major-index array ,@(butlast syms)))))) + (,',writer/ix-vop (array-data-expr array ,is-vector) + ,,index-expression + ,step ,+vector-data-fixup+ + (the sse-pack ,(car (last syms))))) + ,(car (last syms))))))))))) + diff --git a/contrib/cl-simd/sbcl-core.lisp b/contrib/cl-simd/sbcl-core.lisp new file mode 100644 index 000000000..c32b9d032 --- /dev/null +++ b/contrib/cl-simd/sbcl-core.lisp @@ -0,0 +1,557 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file contains definitions of abstract VOPs, macros +;;; and utility functions used to implement the intrinsics. +;;; + +(in-package #:SSE) + +;;; The specific pack types + +(deftype int-sse-pack () '(sse-pack integer)) +(deftype float-sse-pack () '(sse-pack single-float)) +(deftype double-sse-pack () '(sse-pack double-float)) + +;;; Helper functions + +(defconstant +uint32-mask+ #xFFFFFFFF) +(defconstant +uint64-mask+ #xFFFFFFFFFFFFFFFF) +(defconstant +min-int32+ (- (ash 1 31))) +(defconstant +max-int32+ (1- (ash 1 31))) + +(defun type-name-to-primitive (lt) + (primitive-type-name (primitive-type (specifier-type lt)))) + +(defun move-cmd-for-type (lt) + (ecase lt + (int-sse-pack 'movdqa) + ((float-sse-pack double-sse-pack) 'movaps))) + +(defun ensure-reg-or-mem (tn) + (sc-case tn + ((sse-pack-immediate immediate) + (register-inline-constant (tn-value tn))) + (t tn))) + +(defmacro ensure-load (type tgt src) + `(unless (location= ,tgt ,src) + (inst ,(move-cmd-for-type type) ,tgt (ensure-reg-or-mem ,src)))) + +(defmacro ensure-move (type tgt src) + `(unless (location= ,tgt ,src) + (inst ,(move-cmd-for-type type) ,tgt ,src))) + +(defmacro save-intrinsic-spec (name info) + `(eval-when (:compile-toplevel :load-toplevel :execute) + (setf (get ',name 'intrinsic-spec) ',info))) + +(defmacro def-splice-transform (name args &body code) + (let* ((direct-args (mapcar (lambda (x) (if (consp x) (gensym) x)) args)) + (flat-args (mapcan (lambda (x) (if (consp x) (copy-list (rest x)) (list x))) args))) + `(deftransform ,name ((,@direct-args) * *) + ,(format nil "Simplify combination ~A" (cons name args)) + ,@(loop for spec in args and name in direct-args + when (consp spec) + collect `(splice-fun-args ,name ',(first spec) ,(1- (length spec)))) + (list* 'lambda ',flat-args ',code)))) + +;;; Index-offset splicing + +(defun fold-index-addressing (fun-name index scale &key setter-p) + (multiple-value-bind (func index-args) (extract-fun-args index '(+ - * ash) 2) + (destructuring-bind (x constant) index-args + (declare (ignorable x)) + (unless (constant-lvar-p constant) + (give-up-ir1-transform)) + (let ((value (lvar-value constant)) + (scale-value (lvar-value scale))) + (case func + (* (unless (typep (* value scale-value) '(signed-byte 32)) + (give-up-ir1-transform "constant is too large for inlining"))) + (ash (unless (and (>= value 0) + (typep (ash scale-value value) '(signed-byte 32))) + (give-up-ir1-transform "index shift is unsuitable for inlining")))) + (splice-fun-args index func 2) + (let* ((value-arg (when setter-p '(value))) + (is-scale (member func '(* ash))) + (new-scale (if is-scale `(,func scale const) 'scale)) + (new-offset (if is-scale 'offset `(,func offset (* const scale))))) + `(lambda (thing index const scale offset ,@value-arg) + (,fun-name thing index ,new-scale ,new-offset ,@value-arg))))))) + +;;; Index-offset addressing + +(defun is-tagged-load-scale (value) + (not (logtest value (1- (ash 1 n-fixnum-tag-bits))))) + +(deftype tagged-load-scale () + '(and fixnum (satisfies is-tagged-load-scale))) + +(defun find-lea-scale (scale) + (cond ((not (logtest scale 7)) (values (/ scale 8) 8)) + ((not (logtest scale 3)) (values (/ scale 4) 4)) + ((not (logtest scale 1)) (values (/ scale 2) 2)) + (t (values scale 1)))) + +(defun reduce-offset (ioffset scale offset) + "Redistribute value from ioffset to offset, while keeping offset int32." + (let* ((istep (if (< ioffset 0) -1 1)) + (icount (max 0 + (if (< ioffset 0) + (- (1+ +min-int32+) ioffset) ; = (- +max-int32+) + (- ioffset +max-int32+)))) + (ostep (* istep scale)) + (ocount (truncate (- (if (> ostep 0) +max-int32+ +min-int32+) offset) + ostep)) + (count (min ocount icount))) + (values (- ioffset (* count istep)) + (+ offset (* count ostep))))) + +(defun split-offset (offset scale) + (if (typep offset '(signed-byte 32)) + (values 0 offset) + (multiple-value-bind (div rem) (floor offset scale) + (assert (typep rem '(signed-byte 32))) + (if (typep div '(signed-byte 32)) + (values div rem) + (reduce-offset div scale rem))))) + +(defun power-of-2? (scale) + (and (> scale 0) (not (logtest scale (1- scale))))) + +(defun find-power-of-2 (scale) + (assert (power-of-2? scale)) + (loop for i from 0 and sv = scale then (ash sv -1) + when (<= sv 1) return i)) + +(defun make-scaled-ea (size sap index scale offset tmp &key fixnum-index) + "Returns an ea representing the given index*scale + offset formula. +May emit additional instructions using the temporary register." + (assemble () + (if (or (sc-is index immediate) (= scale 0)) + ;; Fully constant offset + (let ((value (if (= scale 0) offset + (+ (* (tn-value index) scale) offset)))) + (assert (typep value '(signed-byte 64))) + (if (typep value '(signed-byte 32)) + (make-ea size :base sap :disp value) + (progn + (inst mov tmp (register-inline-constant value)) + (make-ea size :base sap :index tmp)))) + ;; Indexing + (progn + (when (sc-is index any-reg) + (assert (and fixnum-index (is-tagged-load-scale scale))) + (setf scale (ash scale (- n-fixnum-tag-bits)))) + (multiple-value-bind (rscale lscale) (find-lea-scale scale) + ;; One-instruction case? + (if (and (= rscale 1) (typep offset '(signed-byte 32))) + (make-ea size :base sap :index index :scale scale :disp offset) + ;; Use temporary + (multiple-value-bind (roffset loffset) (split-offset offset lscale) + (labels ((negate-when-<0 (register scale) + (when (< scale 0) + (inst neg register))) + (emit-shift-mul (register scale) + (inst shl register (find-power-of-2 (abs scale))) + (negate-when-<0 register scale)) + ;; Tries to compute tmp via LEA + (try-use-lea (scale &optional base) + (multiple-value-bind (rrscale rlscale) (find-lea-scale scale) + (when (and (= (abs rrscale) 1) (typep (* rrscale roffset) '(signed-byte 32))) + (when (and (= roffset 0) (null base)) ; minimize loffset + (multiple-value-setq (roffset loffset) (floor offset lscale))) + (let ((xoffset (* rrscale roffset))) + (inst lea tmp + (if (and (= rlscale 1) (null base)) + (make-ea :byte :base index :disp xoffset) + (make-ea :byte :base base :index index :scale rlscale :disp xoffset)))) + (negate-when-<0 tmp rrscale) + :success)))) + (declare (inline negate-when-<0 emit-shift-mul)) + (cond + ;; same register shift? + ((and (= roffset 0) (location= tmp index) (power-of-2? (abs rscale))) + (emit-shift-mul tmp rscale)) + ;; one LEA? + ((try-use-lea rscale)) + ((try-use-lea (1- rscale) index)) + ;; Generic case, use mul/shl and add + (t + (if (power-of-2? (abs rscale)) + (progn + (move tmp index) + (emit-shift-mul tmp rscale)) + (inst imul tmp index rscale)) + (unless (= roffset 0) + ;; Make loffset as small as possible + (multiple-value-setq (roffset loffset) (floor offset lscale)) + (if (typep roffset '(signed-byte 32)) + (inst add tmp roffset) + (inst add tmp (register-inline-constant roffset)))))) + (make-ea size :base sap :index tmp :scale lscale :disp loffset))))))))) + +;; Initialization + +(defmacro def-float-set-intrinsic (&whole whole pubname fname atype aregtype rtype move) + (declare (ignore pubname)) + `(progn + (save-intrinsic-spec ,fname ,whole) + (defknown ,fname (,atype) ,rtype (foldable flushable)) + (define-vop (,fname) + (:translate ,fname) + (:args (arg :scs (,aregtype) :target dst)) + (:arg-types ,atype) + (:results (dst :scs (sse-reg))) + (:result-types ,(type-name-to-primitive rtype)) + (:policy :fast-safe) + (:generator 1 + (unless (location= dst arg) + (inst ,move dst arg)))))) + +;; Unary operations + +(define-vop (sse-unary-base-op) + ;; no immediate because expecting to be folded + (:args (x :scs (sse-reg))) + (:arg-types sse-pack) + (:policy :fast-safe) + (:note "inline SSE unary operation") + (:vop-var vop) + (:save-p :compute-only)) + +(define-vop (sse-unary-op sse-unary-base-op) + (:args (x :scs (sse-reg) :target r)) + (:results (r :scs (sse-reg)))) + +(define-vop (sse-unary-to-int-op sse-unary-base-op) + (:results (r :scs (signed-reg)))) + +(define-vop (sse-unary-to-uint-op sse-unary-base-op) + (:results (r :scs (unsigned-reg)))) + +(defmacro def-unary-intrinsic (&whole whole name rtype insn cost c-name &key partial immediate-arg result-size arg-type) + (declare (ignore c-name arg-type)) + (let* ((imm (if immediate-arg '(imm))) + (immt (if immediate-arg (list immediate-arg)))) + (assert (or (not partial) (not (subtypep rtype 'integer)))) + `(progn + (export ',name) + (save-intrinsic-spec ,name ,whole) + (defknown ,name (sse-pack ,@immt) ,rtype (foldable flushable)) + (define-vop (,name ,(cond ((subtypep rtype 'unsigned-byte) + 'sse-unary-to-uint-op) + ((subtypep rtype 'integer) + 'sse-unary-to-int-op) + (t 'sse-unary-op))) + (:translate ,name) + (:result-types ,(type-name-to-primitive rtype)) + ,@(if immediate-arg + `((:arg-types sse-pack (:constant ,immediate-arg)) + (:info imm))) + (:generator ,cost + ,@(ecase partial + (:one-arg `((ensure-move ,rtype r x) + (inst ,insn r ,@imm))) + (t `((ensure-move ,rtype r x) + (inst ,insn r r ,@imm))) + ((nil) `((inst ,insn + ,(if result-size `(reg-in-size r ,result-size) 'r) + x ,@imm))))))))) + +;; Unary to int32 & sign-extend + +(define-vop (sse-cvt-to-int32-op sse-unary-base-op) + (:temporary (:sc signed-reg :offset rax-offset :target r :to :result) rax) + (:results (r :scs (signed-reg)))) + +(defmacro def-cvt-to-int32-intrinsic (name rtype insn cost c-name &key arg-type) + (declare (ignore arg-type)) + `(progn + (export ',name) + (save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn ,cost ,c-name)) + (defknown ,name (sse-pack) (signed-byte 32) (foldable flushable)) + (define-vop (,name sse-cvt-to-int32-op) + (:translate ,name) + (:result-types ,(type-name-to-primitive rtype)) + (:generator ,cost + (inst ,insn (reg-in-size rax :dword) x) + (inst cdqe) + (move r rax))))) + +;; NOT intrinsics + +(define-vop (sse-not-op sse-unary-op) + (:temporary (:sc sse-reg) tmp)) + +(defmacro def-not-intrinsic (name rtype insn) + `(progn + (export ',name) + (save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn 3 nil)) + (defknown ,name (sse-pack) ,rtype (foldable flushable)) + (define-vop (,name sse-not-op) + (:translate ,name) + (:result-types ,(type-name-to-primitive rtype)) + (:generator 3 + (if (location= x r) + (progn + (inst pcmpeqd tmp tmp) + (inst ,insn r tmp)) + (progn + (inst pcmpeqd r r) + (inst ,insn r x))))))) + +;; Binary operations + +(define-vop (sse-binary-base-op) + (:args (x :scs (sse-reg sse-pack-immediate) :target r) + (y :scs (sse-reg sse-pack-immediate))) + (:results (r :scs (sse-reg))) + (:arg-types sse-pack sse-pack) + (:policy :fast-safe) + (:note "inline SSE binary operation") + (:vop-var vop) + (:save-p :compute-only)) + +(define-vop (sse-binary-op sse-binary-base-op) + (:temporary (:sc sse-reg) tmp)) + +(define-vop (sse-binary-comm-op sse-binary-base-op) + (:args (x :scs (sse-reg sse-pack-immediate) :target r) + (y :scs (sse-reg sse-pack-immediate) :target r))) + +(defmacro def-binary-intrinsic (&whole whole name rtype insn cost c-name &key commutative tags immediate-arg x-type y-type) + (declare (ignore c-name x-type y-type)) + (let* ((imm (if immediate-arg '(imm))) + (immt (if immediate-arg (list immediate-arg)))) + `(progn + (export ',name) + (save-intrinsic-spec ,name ,whole) + (defknown ,name (sse-pack sse-pack ,@immt) ,rtype (foldable flushable)) + (define-vop (,name ,(if commutative 'sse-binary-comm-op 'sse-binary-op)) + (:translate ,name) + (:result-types ,(type-name-to-primitive rtype)) + ,@(if immediate-arg + `((:arg-types sse-pack sse-pack (:constant ,immediate-arg)) + (:info imm))) + (:generator ,cost + ,@(if commutative + `((when (location= y r) + (rotatef x y)) + (ensure-load ,rtype r x) + (inst ,insn ,@tags r (ensure-reg-or-mem y) ,@imm)) + `((unless (location= y r) + (setf tmp r)) + (ensure-load ,rtype tmp x) + (inst ,insn ,@tags tmp (ensure-reg-or-mem y) ,@imm) + (ensure-move ,rtype r tmp)))))))) + +;;; XMM/Integer combination intrinsics + +(define-vop (sse-int-base-op) + (:results (r :scs (sse-reg))) + (:policy :fast-safe) + (:note "inline SSE/integer operation") + (:vop-var vop) + (:save-p :compute-only)) + +(define-vop (sse-int-op sse-int-base-op) + (:args (x :scs (sse-reg sse-pack-immediate) :target r) + (iv :scs (signed-reg signed-stack immediate))) + (:arg-types sse-pack signed-num)) + +(define-vop (sse-uint-op sse-int-base-op) + (:args (x :scs (sse-reg sse-pack-immediate) :target r) + (iv :scs (unsigned-reg unsigned-stack immediate))) + (:arg-types sse-pack unsigned-num)) + +(defmacro def-sse-int-intrinsic (&whole whole name itype rtype insn cost c-name &key make-temporary immediate-arg) + (declare (ignore c-name)) + (let* ((imm (if immediate-arg '(imm))) + (immt (if immediate-arg (list immediate-arg))) + (unsigned? (subtypep itype 'unsigned-byte))) + `(progn + (export ',name) + (save-intrinsic-spec ,name ,whole) + (defknown ,name (sse-pack ,itype ,@immt) ,rtype (foldable flushable)) + (define-vop (,name ,(if unsigned? 'sse-uint-op 'sse-int-op)) + (:translate ,name) + (:result-types ,(type-name-to-primitive rtype)) + ,@(if immediate-arg + `((:arg-types sse-pack + ,(if unsigned? 'unsigned-num 'signed-num) + (:constant ,immediate-arg)) + (:info imm))) + ,@(if make-temporary + `((:temporary (:sc sse-reg) tmp))) + (:generator ,cost + (ensure-load ,rtype r x) + ,@(if (eq make-temporary t) + '((inst movd tmp (ensure-reg-or-mem iv))) + make-temporary) + (inst ,insn r ,(if make-temporary 'tmp '(ensure-reg-or-mem iv)) ,@imm)))))) + +;;; Memory intrinsics + +(define-vop (sse-load-base-op) + (:results (r :scs (sse-reg))) + (:policy :fast-safe) + (:note "inline SSE load operation")) + +(define-vop (sse-load-op sse-load-base-op) + (:args (sap :scs (sap-reg)) + (offset :scs (signed-reg))) + (:arg-types system-area-pointer signed-num)) + +(define-vop (sse-xmm-load-op sse-load-base-op) + (:args (value :scs (sse-reg sse-pack-immediate) :target r) + (sap :scs (sap-reg)) + (offset :scs (signed-reg))) + (:arg-types sse-pack system-area-pointer signed-num)) + +(define-vop (sse-load-imm-op sse-load-base-op) + (:args (sap :scs (sap-reg))) + (:arg-types system-area-pointer + (:constant (signed-byte 32))) + (:info offset)) + +(define-vop (sse-xmm-load-imm-op sse-load-base-op) + (:args (value :scs (sse-reg sse-pack-immediate) :target r) + (sap :scs (sap-reg))) + (:arg-types sse-pack system-area-pointer + (:constant (signed-byte 32))) + (:info offset)) + +(define-vop (sse-load-ix-op sse-load-base-op) + (:args (sap :scs (descriptor-reg) :to :eval) + (index :scs (signed-reg immediate) :target tmp)) + (:arg-types * signed-num (:constant fixnum) (:constant fixnum)) + (:temporary (:sc signed-reg :from (:argument 1)) tmp) + (:info scale offset)) + +(define-vop (sse-load-ix-op/tag sse-load-base-op) + (:args (sap :scs (descriptor-reg) :to :eval) + (index :scs (any-reg signed-reg immediate) :target tmp)) + (:arg-types * tagged-num (:constant tagged-load-scale) (:constant fixnum)) + (:temporary (:sc any-reg :from (:argument 1)) tmp) + (:info scale offset)) + +(defmacro def-load-intrinsic (&whole whole name rtype insn c-name + &key register-arg tags postfix-fmt (size :qword)) + (declare (ignore c-name postfix-fmt)) + (let* ((vop (symbolicate "%" name)) + (c-vop (symbolicate vop "-C")) + (ix-vop (symbolicate vop "/IX")) + (valtype (if register-arg '(sse-pack))) + (valarg (if register-arg '(value))) + (r-arg (if rtype '(r))) + (rtypes (if rtype + `(:result-types ,(type-name-to-primitive rtype)) + `(:results)))) + (assert (or rtype (not register-arg))) + `(progn + (export ',name) + (save-intrinsic-spec ,name ,whole) + (defknown ,vop (,@valtype system-area-pointer fixnum) ,(or rtype '(values)) (flushable always-translatable)) + (define-vop (,vop ,(if register-arg 'sse-xmm-load-op 'sse-load-op)) + (:translate ,vop) + ,rtypes + (:generator 5 + ,(if register-arg `(ensure-load ,rtype r value)) + (inst ,insn ,@tags ,@r-arg (make-ea ,size :base sap :index offset)))) + (define-vop (,c-vop ,(if register-arg 'sse-xmm-load-imm-op 'sse-load-imm-op)) + (:translate ,vop) + ,rtypes + (:generator 4 + ,(if register-arg `(ensure-load ,rtype r value)) + (inst ,insn ,@tags ,@r-arg (make-ea ,size :base sap :disp offset)))) + (def-splice-transform ,vop (,@valarg (sap+ sap offset1) offset2) + (,vop ,@valarg sap (+ offset1 offset2))) + ,@(if (null register-arg) + `(;; Vector indexing version + (defknown ,ix-vop (simple-array fixnum fixnum fixnum) ,(or rtype '(values)) + (flushable always-translatable)) + (define-vop (,ix-vop sse-load-ix-op) + (:translate ,ix-vop) + ,rtypes + (:generator 4 + (inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp)))) + (define-vop (,(symbolicate ix-vop "/TAG") sse-load-ix-op/tag) + (:translate ,ix-vop) + ,rtypes + (:generator 3 + (inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp :fixnum-index t)))) + (deftransform ,ix-vop ((thing index scale offset)) + "fold semi-constant index expressions" + (fold-index-addressing ',ix-vop index scale))))))) + +(define-vop (sse-store-base-op) + (:policy :fast-safe) + (:note "inline SSE store operation")) + +(define-vop (sse-store-op sse-store-base-op) + (:args (sap :scs (sap-reg)) + (offset :scs (signed-reg)) + (value :scs (sse-reg))) + (:arg-types system-area-pointer signed-num sse-pack)) + +(define-vop (sse-store-imm-op sse-store-base-op) + (:args (sap :scs (sap-reg)) + (value :scs (sse-reg))) + (:arg-types system-area-pointer + (:constant (signed-byte 32)) + sse-pack) + (:info offset)) + +(define-vop (sse-store-ix-op sse-store-base-op) + (:args (sap :scs (descriptor-reg) :to :eval) + (index :scs (signed-reg immediate) :target tmp) + (value :scs (sse-reg))) + (:arg-types * signed-num (:constant fixnum) (:constant fixnum) sse-pack) + (:temporary (:sc signed-reg :from (:argument 1)) tmp) + (:info scale offset)) + +(define-vop (sse-store-ix-op/tag sse-store-base-op) + (:args (sap :scs (descriptor-reg) :to :eval) + (index :scs (any-reg signed-reg immediate) :target tmp) + (value :scs (sse-reg))) + (:arg-types * tagged-num (:constant tagged-load-scale) (:constant fixnum) sse-pack) + (:temporary (:sc any-reg :from (:argument 1)) tmp) + (:info scale offset)) + +(defmacro def-store-intrinsic (&whole whole name rtype insn c-name &key setf-name) + (declare (ignore rtype c-name)) + (let* ((vop (symbolicate "%" name)) + (c-vop (symbolicate vop "-C")) + (ix-vop (symbolicate vop "/IX"))) + `(progn + ,(unless setf-name `(export ',name)) + (save-intrinsic-spec ,name ,whole) + (defknown ,vop (system-area-pointer fixnum sse-pack) (values) (unsafe always-translatable)) + (define-vop (,vop sse-store-op) + (:translate ,vop) + (:generator 5 + (inst ,insn (make-ea :qword :base sap :index offset) value))) + (define-vop (,c-vop sse-store-imm-op) + (:translate ,vop) + (:generator 4 + (inst ,insn (make-ea :qword :base sap :disp offset) value))) + (def-splice-transform ,vop ((sap+ sap offset1) offset2 new-value) + (,vop sap (+ offset1 offset2) new-value)) + ;; Vector indexing version + (defknown ,ix-vop (simple-array fixnum fixnum fixnum sse-pack) (values) + (unsafe always-translatable)) + (define-vop (,ix-vop sse-store-ix-op) + (:translate ,ix-vop) + (:generator 4 + (inst ,insn (make-scaled-ea :qword sap index scale offset tmp) value))) + (define-vop (,(symbolicate ix-vop "/TAG") sse-store-ix-op/tag) + (:translate ,ix-vop) + (:generator 3 + (inst ,insn (make-scaled-ea :qword sap index scale offset tmp :fixnum-index t) value))) + (deftransform ,ix-vop ((thing index scale offset value)) + "fold semi-constant index expressions" + (fold-index-addressing ',ix-vop index scale :setter-p t))))) + diff --git a/contrib/cl-simd/sbcl-functions.lisp b/contrib/cl-simd/sbcl-functions.lisp new file mode 100644 index 000000000..0c836cff6 --- /dev/null +++ b/contrib/cl-simd/sbcl-functions.lisp @@ -0,0 +1,454 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file implements VOP-wrapping functions and non-primitive +;;; extensions to the core intrinsic set. +;;; + +(in-package #:SSE) + +;;; Materialize the intrinsic functions. + +;; Since VOPs are activated only on load, actual functions that +;; wrap them have to be defined in a different file. This is a +;; hack to generate the functions from the same macro invocations +;; as the VOPS. + +(macrolet ((def-float-set-intrinsic (pubname fname atype aregtype rtype move) + (declare (ignore aregtype move)) + `(progn + (defun ,fname (arg) + (declare (type ,atype arg)) + (truly-the ,rtype (%primitive ,fname arg))) + ;; Public function - includes coercion + (export ',pubname) + (declaim (ftype (function (real) ,rtype) ,pubname) + (inline ,pubname)) + (defun ,pubname (arg) (,fname (coerce arg ',atype))))) + (def-unary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys) + (declare (ignore insn cost c-name)) + (unless immediate-arg + `(defun ,name (x) + (declare (type sse-pack x)) + (truly-the ,rtype (%primitive ,name x))))) + (def-binary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys) + (declare (ignore insn cost c-name)) + (unless immediate-arg + `(defun ,name (x y ,@(if immediate-arg '(imm))) + (declare (type sse-pack x y)) + (truly-the ,rtype (%primitive ,name x y))))) + (def-sse-int-intrinsic (name itype rtype insn cost c-name &key immediate-arg &allow-other-keys) + (declare (ignore insn cost c-name)) + (unless immediate-arg + `(defun ,name (x iv) + (declare (type sse-pack x) + (type ,itype iv)) + (truly-the ,rtype (%primitive ,name x iv))))) + (def-load-intrinsic (name rtype insn c-name &key register-arg &allow-other-keys) + (declare (ignore insn c-name)) + (let* ((vop (symbolicate "%" name)) + (valarg (if register-arg '(value)))) + `(progn + (declaim (inline ,name)) + (defun ,name (,@valarg pointer &optional (offset 0)) + (declare ,@(if register-arg '((type sse-pack value))) + (type system-area-pointer pointer) + (type fixnum offset)) + ,(if rtype + `(truly-the ,rtype (,vop ,@valarg pointer offset)) + `(,vop ,@valarg pointer offset)))))) + (def-store-intrinsic (name rtype insn c-name &key setf-name &allow-other-keys) + (declare (ignore insn c-name)) + (let* ((vop (symbolicate "%" name))) + `(progn + (declaim (inline ,name)) + (defun ,name (pointer value &optional (offset 0)) + (declare (type system-area-pointer pointer) + (type sse-pack value) + (type fixnum offset)) + (,vop pointer offset value) + (truly-the ,rtype value)) + ,(if setf-name + `(defsetf ,setf-name (pointer &optional (offset 0)) (value) + `(,',name ,pointer ,value ,offset))))))) + ;; Load the definition list + #.(loop for name being each present-symbol + when (get name 'intrinsic-spec) + collect it into specs + finally (return `(progn ,@specs)))) + +;;; Helper functions and macros + +(defmacro def-utility (name args rtype &body code) + `(progn + (export ',name) + (declaim (ftype (function ,(mapcar (constantly 'sse-pack) args) ,rtype) ,name) + (inline ,name)) + (defun ,name ,args ,@code))) + +(defmacro def-if-function (name rtype postfix) + (let* ((not-x (symbolicate "NOT-" postfix)) + (or-x (symbolicate "OR-" postfix)) + (and-x (symbolicate "AND-" postfix)) + (andn-x (symbolicate "ANDNOT-" postfix)) + (xor-x (symbolicate "XOR-" postfix)) + (true (%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)) + (false (%make-sse-pack 0 0))) + `(progn + (export ',name) + (defknown ,name (sse-pack sse-pack sse-pack) ,rtype (foldable flushable)) + (defun ,name (condition true-val false-val) + (,or-x (,and-x condition true-val) + (,andn-x condition false-val))) + ;; Instead of inlining, use a transform so that the splice + ;; rule has a chance to apply. This depends on transform + ;; definitions behaving like a LIFO: + (deftransform ,name ((condition true-val false-val) * *) + "Expand the conditional." + '(,or-x (,and-x condition true-val) (,andn-x condition false-val))) + (def-splice-transform ,name ((,not-x cond) tv fv) (,name cond fv tv)) + ;; NOT elimination and partial constant folding for bitwise ops: + (def-splice-transform ,not-x ((,not-x arg1)) arg1) + (def-splice-transform ,and-x (arg1 (,not-x arg2)) (,andn-x arg2 arg1)) + (def-splice-transform ,and-x ((,not-x arg1) arg2) (,andn-x arg1 arg2)) + (def-splice-transform ,andn-x ((,not-x arg1) arg2) (,and-x arg1 arg2)) + (%deftransform ',or-x '(function * *) #'commutative-arg-swap "place constant arg last") + (%deftransform ',and-x '(function * *) #'commutative-arg-swap "place constant arg last") + (%deftransform ',xor-x '(function * *) #'commutative-arg-swap "place constant arg last") + (deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,true))) *) ,true) + (deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1) + (deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1) + (deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false) + (deftransform ,xor-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1) + (deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1) + (deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false) + (deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,true)) *) *) ,false) + (deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,false)) *) *) 'arg2)))) + +(defmacro def-not-cmp-pairs (not-fun &rest pairs) + `(progn + ,@(loop for (a b) on pairs by #'cddr + collect `(def-splice-transform ,not-fun ((,a arg1 arg2)) (,b arg1 arg2)) + collect `(def-splice-transform ,not-fun ((,b arg1 arg2)) (,a arg1 arg2))))) + +;;; CPU control + +(defun cpu-mxcsr () + (cpu-mxcsr)) + +(defun %set-cpu-mxcsr (x) + (declare (type (unsigned-byte 32) x)) + (%set-cpu-mxcsr x)) + +(defsetf cpu-mxcsr %set-cpu-mxcsr) + +(defun cpu-load-fence () (cpu-load-fence)) +(defun cpu-store-fence () (cpu-store-fence)) +(defun cpu-memory-fence () (cpu-memory-fence)) + +(defun cpu-pause () (cpu-pause)) + +;;; Single-float + +;; Constants + +(define-symbol-macro 0.0-ps (truly-the float-sse-pack #.(%make-sse-pack 0 0))) + +(define-symbol-macro true-ss (truly-the float-sse-pack #.(%make-sse-pack #xFFFFFFFF 0))) +(define-symbol-macro true-ps (truly-the float-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))) + +(define-symbol-macro false-ss (truly-the float-sse-pack #.(%make-sse-pack 0 0))) +(define-symbol-macro false-ps (truly-the float-sse-pack #.(%make-sse-pack 0 0))) + +;; Initialization + +(declaim (inline set1-ps set-ps setr-ps setzero-ps)) + +(defun set1-ps (val) + (let ((valv (set-ss val))) + (shuffle-ps valv valv 0))) + +(defun set-ps (x3 x2 x1 x0) + (movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1)) + (unpacklo-ps (set-ss x2) (set-ss x3)))) + +(defun setr-ps (x0 x1 x2 x3) + (movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1)) + (unpacklo-ps (set-ss x2) (set-ss x3)))) + +(defun setzero-ps () 0.0-ps) + +;; Arithmetic negation + +(def-utility neg-ss (arg) float-sse-pack + (xor-ps arg #.(%make-sse-pack #x80000000 0))) + +(def-utility neg-ps (arg) float-sse-pack + (xor-ps arg #.(%make-sse-pack #x8000000080000000 #x8000000080000000))) + +;; Bitwise operations + +(def-if-function if-ps float-sse-pack #:ps) + +;; Comparisons + +(def-utility >-ss (x y) float-sse-pack (<-ss y x)) +(def-utility >-ps (x y) float-sse-pack (<-ps y x)) +(def-utility >=-ss (x y) float-sse-pack (<=-ss y x)) +(def-utility >=-ps (x y) float-sse-pack (<=-ps y x)) +(def-utility />-ss (x y) float-sse-pack (/<-ss y x)) +(def-utility />-ps (x y) float-sse-pack (/<-ps y x)) +(def-utility />=-ss (x y) float-sse-pack (/<=-ss y x)) +(def-utility />=-ps (x y) float-sse-pack (/<=-ps y x)) + +(def-not-cmp-pairs not-ps + =-ps /=-ps <-ps /<-ps <=-ps /<=-ps >-ps />-ps >=-ps />=-ps cmpord-ps cmpunord-ps) + +;; Shuffle + +(declaim (inline %sse-pack-to-int %int-to-sse-pack %shuffle-subints)) + +(defun %sse-pack-to-int (pack) + (logior (%sse-pack-low pack) (ash (%sse-pack-high pack) 64))) + +(defun %int-to-sse-pack (val &aux (mask #xFFFFFFFFFFFFFFFF)) + (%make-sse-pack (logand val mask) (logand (ash val -64) mask))) + +(defun %shuffle-subints (xval yval imm bit-cnt &aux (mask (1- (ash 1 bit-cnt)))) + (flet ((bits (idx) + (logand 3 (ash imm (* -2 idx)))) + (val (src idx) + (logand mask (ash src (* (- bit-cnt) idx))))) + (logior (val xval (bits 0)) + (ash (val xval (bits 1)) bit-cnt) + (ash (val yval (bits 2)) (* 2 bit-cnt)) + (ash (val yval (bits 3)) (* 3 bit-cnt))))) + +(defun shuffle-ps (x y imm) + (declare (type sse-pack x y)) + (let* ((xval (%sse-pack-to-int x)) + (yval (%sse-pack-to-int y))) + (truly-the float-sse-pack (%int-to-sse-pack (%shuffle-subints xval yval imm 32))))) + +;;; Double-float + +;; Constants + +(define-symbol-macro 0.0-pd (truly-the double-sse-pack #.(%make-sse-pack 0 0))) + +(define-symbol-macro true-sd (truly-the double-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF 0))) +(define-symbol-macro true-pd (truly-the double-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))) + +(define-symbol-macro false-sd (truly-the double-sse-pack #.(%make-sse-pack 0 0))) +(define-symbol-macro false-pd (truly-the double-sse-pack #.(%make-sse-pack 0 0))) + +;; Initialization + +(declaim (inline set1-pd set-pd setr-pd setzero-pd)) + +(defun set1-pd (val) + (let ((valv (set-sd val))) + (shuffle-pd valv valv 0))) + +(defun set-pd (x1 x0) + (unpacklo-pd (set-sd x0) (set-sd x1))) + +(defun setr-pd (x0 x1) + (unpacklo-pd (set-sd x0) (set-sd x1))) + +(defun setzero-pd () 0.0-pd) + +;; Arithmetic negation + +(def-utility neg-sd (arg) double-sse-pack + (xor-pd arg #.(%make-sse-pack #x8000000000000000 0))) + +(def-utility neg-pd (arg) double-sse-pack + (xor-pd arg #.(%make-sse-pack #x8000000000000000 #x8000000000000000))) + +;; Bitwise operations + +(def-if-function if-pd double-sse-pack #:pd) + +;; Comparisons + +(def-utility >-sd (x y) double-sse-pack (<-sd y x)) +(def-utility >-pd (x y) double-sse-pack (<-pd y x)) +(def-utility >=-sd (x y) double-sse-pack (<=-sd y x)) +(def-utility >=-pd (x y) double-sse-pack (<=-pd y x)) +(def-utility />-sd (x y) double-sse-pack (/<-sd y x)) +(def-utility />-pd (x y) double-sse-pack (/<-pd y x)) +(def-utility />=-sd (x y) double-sse-pack (/<=-sd y x)) +(def-utility />=-pd (x y) double-sse-pack (/<=-pd y x)) + +(def-not-cmp-pairs not-pd + =-pd /=-pd <-pd /<-pd <=-pd /<=-pd >-pd />-pd >=-pd />=-pd cmpord-pd cmpunord-pd) + +;; Shuffle + +(defun shuffle-pd (x y imm) + (declare (type sse-pack x y)) + (truly-the double-sse-pack + (%make-sse-pack (if (logtest imm 1) (%sse-pack-high x) (%sse-pack-low x)) + (if (logtest imm 2) (%sse-pack-high y) (%sse-pack-low y))))) + +;;; Integer + +;; Constants + +(define-symbol-macro 0-pi (truly-the int-sse-pack #.(%make-sse-pack 0 0))) + +(define-symbol-macro true-pi (truly-the int-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))) + +(define-symbol-macro false-pi (truly-the int-sse-pack #.(%make-sse-pack 0 0))) + +;; Initialization + +(macrolet ((defset (name type) + `(defun ,name (x) + (declare (type ,type x)) + (,name x)))) + (defset %set-int (signed-byte 64)) + (defset %set-uint (unsigned-byte 64)) + (defset convert-si32-to-pi (signed-byte 32)) + (defset convert-su32-to-pi (unsigned-byte 32)) + (defset convert-si64-to-pi (signed-byte 64)) + (defset convert-su64-to-pi (unsigned-byte 64))) + +(macrolet ((defset1 (name setter type shuffle &rest expands) + `(progn + (export ',name) + (declaim (inline ,name)) + (defun ,name (arg) + (let ((val (,setter (the ,type arg)))) + (declare (type int-sse-pack val)) + ,@(loop for x in expands collect `(setq val (,x val val))) + (shuffle-pi32 val ,shuffle)))))) + (defset1 set1-pi8 %set-int fixnum #4r0000 unpacklo-pi8 unpacklo-pi16) + (defset1 set1-pi16 %set-int fixnum #4r0000 unpacklo-pi16) + (defset1 set1-pi32 %set-int (signed-byte 32) #4r0000) + (defset1 set1-pu32 %set-uint (unsigned-byte 32) #4r0000) + (defset1 set1-pi64 %set-int (signed-byte 64) #4r1010) + (defset1 set1-pu64 %set-uint (unsigned-byte 64) #4r1010)) + +(macrolet ((defset (name rname setter type depth) + (let* ((names (loop for i from 0 below (ash 1 depth) + collect (symbolicate (format nil "X~A" i)))) + (funcs #(unpacklo-pi64 unpacklo-pi32 unpacklo-pi16 unpacklo-pi8)) + (body (loop for i downfrom depth to 0 + for bv = (mapcar (lambda (x) `(,setter (the ,type ,x))) names) + then (loop for (a b) on bv by #'cddr + collect `(,(svref funcs i) ,a ,b)) + finally (return (first bv))))) + `(progn + (export ',name) + (export ',rname) + (declaim (inline ,name ,rname)) + (defun ,name (,@(reverse names)) ,body) + (defun ,rname (,@names) ,body))))) + (defset set-pi8 setr-pi8 %set-int fixnum 4) + (defset set-pi16 setr-pi16 %set-int fixnum 3) + (defset set-pi32 setr-pi32 %set-int (signed-byte 32) 2) + (defset set-pu32 setr-pu32 %set-uint (unsigned-byte 32) 2) + (defset set-pi64 setr-pi64 %set-int (signed-byte 64) 1) + (defset set-pu64 setr-pu64 %set-uint (unsigned-byte 64) 1)) + +(declaim (inline setzero-pi)) +(defun setzero-pi () 0-pi) + +;; Masked move + +(export 'maskmoveu-pi) + +(declaim (inline maskmoveu-pi)) + +(defun maskmoveu-pi (value mask pointer &optional (offset 0)) + (declare (type sse-pack value mask) + (type system-area-pointer pointer) + (type fixnum offset)) + (%maskmoveu-pi value mask pointer offset)) + +;; Arithmetic negation (subtract from 0) + +(macrolet ((frob (name subf) + `(def-utility ,name (arg) int-sse-pack (,subf 0-pi arg)))) + (frob neg-pi8 sub-pi8) + (frob neg-pi16 sub-pi16) + (frob neg-pi32 sub-pi32) + (frob neg-pi64 sub-pi64)) + +;; Bitwise operations + +(def-if-function if-pi int-sse-pack #:pi) + +;; Comparisons + +(def-utility <-pi8 (x y) int-sse-pack (>-pi8 y x)) +(def-utility <-pi16 (x y) int-sse-pack (>-pi16 y x)) +(def-utility <-pi32 (x y) int-sse-pack (>-pi32 y x)) + +(def-utility <=-pi8 (x y) int-sse-pack (not-pi (>-pi8 x y))) +(def-utility <=-pi16 (x y) int-sse-pack (not-pi (>-pi16 x y))) +(def-utility <=-pi32 (x y) int-sse-pack (not-pi (>-pi32 x y))) + +(def-utility >=-pi8 (x y) int-sse-pack (not-pi (>-pi8 y x))) +(def-utility >=-pi16 (x y) int-sse-pack (not-pi (>-pi16 y x))) +(def-utility >=-pi32 (x y) int-sse-pack (not-pi (>-pi32 y x))) + +(def-utility /=-pi8 (x y) int-sse-pack (not-pi (=-pi8 x y))) +(def-utility /=-pi16 (x y) int-sse-pack (not-pi (=-pi16 x y))) +(def-utility /=-pi32 (x y) int-sse-pack (not-pi (=-pi32 x y))) + +;; Shifts + +(defun slli-pi (x imm) + (declare (type sse-pack x)) + (truly-the int-sse-pack + (if (> imm 15) + 0-pi + (%int-to-sse-pack (ash (%sse-pack-to-int x) (* 8 imm)))))) + +(defun srli-pi (x imm) + (declare (type sse-pack x)) + (truly-the int-sse-pack + (if (> imm 15) + 0-pi + (%int-to-sse-pack (ash (%sse-pack-to-int x) (* -8 imm)))))) + +;; Extract & insert + +(defun extract-pi16 (x imm) + (declare (type sse-pack x)) + (logand #xFFFF + (ash (%sse-pack-to-int x) + (- (* 16 (logand imm 7)))))) + +(defun insert-pi16 (x intv imm) + (declare (type sse-pack x)) + (let ((shift (* 16 (logand imm 7)))) + (truly-the int-sse-pack + (%int-to-sse-pack + (logior (logand (%sse-pack-to-int x) + (lognot (ash #xFFFF shift))) + (ash (logand intv #xFFFF) shift)))))) + +;; Shuffle + +(defun shuffle-pi32 (x imm) + (declare (type sse-pack x)) + (let* ((xval (%sse-pack-to-int x))) + (truly-the int-sse-pack (%int-to-sse-pack (%shuffle-subints xval xval imm 32))))) + +(defun shufflelo-pi16 (x imm) + (declare (type sse-pack x)) + (let* ((xval (%sse-pack-low x))) + (truly-the int-sse-pack (%make-sse-pack (%shuffle-subints xval xval imm 16) + (%sse-pack-high x))))) + +(defun shufflehi-pi16 (x imm) + (declare (type sse-pack x)) + (let* ((xval (%sse-pack-high x))) + (truly-the int-sse-pack (%make-sse-pack (%sse-pack-low x) + (%shuffle-subints xval xval imm 16))))) + diff --git a/contrib/cl-simd/sse-array-defs.lisp b/contrib/cl-simd/sse-array-defs.lisp new file mode 100644 index 000000000..040f1ea39 --- /dev/null +++ b/contrib/cl-simd/sse-array-defs.lisp @@ -0,0 +1,61 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file contains definitions for vectorized access +;;; to specialized lisp arrays. +;;; + +(in-package #:SSE) + +;;; Prefetch: AREF-PREFETCH-*, ROW-MAJOR-AREF-PREFETCH-* + +(def-aref-intrinsic #:PREFETCH-T0 nil cpu-prefetch-t0 nil :check-bounds nil) +(def-aref-intrinsic #:PREFETCH-T1 nil cpu-prefetch-t1 nil :check-bounds nil) +(def-aref-intrinsic #:PREFETCH-T2 nil cpu-prefetch-t2 nil :check-bounds nil) +(def-aref-intrinsic #:PREFETCH-NTA nil cpu-prefetch-nta nil :check-bounds nil) + +(def-aref-intrinsic #:CLFLUSH nil cpu-clflush nil :check-bounds :no-gap) + +;;; Single-float + +;; AREF-PS, ROW-MAJOR-AREF-PS + +(def-aref-intrinsic #:PS float-sse-pack mem-ref-ps mem-set-ps) + +;; AREF-APS, ROW-MAJOR-AREF-APS (requires alignment) + +(def-aref-intrinsic #:APS float-sse-pack mem-ref-aps mem-set-aps) + +;; AREF-SPS, ROW-MAJOR-AREF-SPS (requires alignment; no write cache) + +(def-aref-intrinsic #:SPS float-sse-pack mem-ref-aps stream-ps) + +;;; Double-float + +;; AREF-PD, ROW-MAJOR-AREF-PD + +(def-aref-intrinsic #:PD double-sse-pack mem-ref-pd mem-set-pd) + +;; AREF-APD, ROW-MAJOR-AREF-APD (requires alignment) + +(def-aref-intrinsic #:APD double-sse-pack mem-ref-apd mem-set-apd) + +;; AREF-SPD, ROW-MAJOR-AREF-SPD (requires alignment; no write cache) + +(def-aref-intrinsic #:SPD double-sse-pack mem-ref-apd stream-pd) + +;;; Integer + +;; AREF-PI, ROW-MAJOR-AREF-PI + +(def-aref-intrinsic #:PI int-sse-pack mem-ref-pi mem-set-pi) + +;; AREF-API, ROW-MAJOR-AREF-API (requires alignment) + +(def-aref-intrinsic #:API int-sse-pack mem-ref-api mem-set-api) + +;; AREF-SPI, ROW-MAJOR-AREF-SPI (requires alignment; no write cache) + +(def-aref-intrinsic #:SPI int-sse-pack mem-ref-api stream-pi) + diff --git a/contrib/cl-simd/sse-intrinsics.lisp b/contrib/cl-simd/sse-intrinsics.lisp new file mode 100644 index 000000000..a92586a70 --- /dev/null +++ b/contrib/cl-simd/sse-intrinsics.lisp @@ -0,0 +1,689 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file contains definitions for all SSE intrinsics. +;;; +;;; The macros are defined in the *-core.lisp files. +;;; On SBCL wrapping functions are defined by sbcl-functions.lisp. +;;; + +(in-package #:SSE) + +#+(and ecl (or ecl_min stage1 cross)) +(eval-when (:compile-toplevel) + ;; During the initial bootstrap sequence when the contribs are + ;; compiled, the system does not load fasls after building them. + ;; (For all it knows, it might be cross-compiling to another architecture.) + ;; Work around by loading the macro definition file into the interpreter: + (load (merge-pathnames #P"ecl-sse-core.lisp" *compile-file-truename*))) + +;;; Prefetch + +(def-load-intrinsic cpu-prefetch-t0 nil prefetch "_mm_prefetch" :tags (:t0) :size :byte :postfix-fmt ",_MM_HINT_T0") +(def-load-intrinsic cpu-prefetch-t1 nil prefetch "_mm_prefetch" :tags (:t1) :size :byte :postfix-fmt ",_MM_HINT_T1") +(def-load-intrinsic cpu-prefetch-t2 nil prefetch "_mm_prefetch" :tags (:t2) :size :byte :postfix-fmt ",_MM_HINT_T2") +(def-load-intrinsic cpu-prefetch-nta nil prefetch "_mm_prefetch" :tags (:nta) :size :byte :postfix-fmt ",_MM_HINT_NTA") + +(def-load-intrinsic cpu-clflush nil clflush "_mm_clflush" :size :byte) + +;;; CPU control + +#+sbcl +(progn + (defknown cpu-mxcsr () (unsigned-byte 32) (flushable)) + + (define-vop (cpu-mxcsr) + (:translate cpu-mxcsr) + (:args) (:arg-types) + (:results (result :scs (unsigned-reg))) + (:result-types unsigned-num) + (:temporary (:sc unsigned-stack) tmp) + (:policy :fast-safe) + (:generator 3 + (let ((ea (make-ea :dword :base rbp-tn + :disp (frame-byte-offset (tn-offset tmp))))) + (inst stmxcsr ea) + (inst mov (reg-in-size result :dword) ea)))) + + (defknown %set-cpu-mxcsr ((unsigned-byte 32)) (unsigned-byte 32) (unsafe)) + + (define-vop (%set-cpu-mxcsr) + (:translate %set-cpu-mxcsr) + (:args (value :scs (unsigned-reg unsigned-stack) :target result)) + (:arg-types unsigned-num) + (:results (result :scs (unsigned-reg) + :load-if (not (and (sc-is result unsigned-stack) + (or (sc-is value unsigned-reg) + (location= value result)))))) + (:result-types unsigned-num) + (:temporary (:sc unsigned-stack) tmp) + (:policy :fast-safe) + (:generator 3 + (cond ((sc-is value unsigned-stack) + (setf tmp value)) + ((sc-is result unsigned-stack) + (setf tmp result))) + (move tmp value) + (unless (location= result tmp) + (move result value)) + (let ((ea (make-ea :dword :base rbp-tn + :disp (frame-byte-offset (tn-offset tmp))))) + (inst ldmxcsr ea)))) + + (macrolet ((defvoid (name insn) + `(progn + (export ',name) + (defknown ,name () (values) ()) + (define-vop (,name) + (:translate ,name) + (:policy :fast-safe) + (:generator 1 + (inst ,insn)))))) + (defvoid cpu-load-fence lfence) + (defvoid cpu-store-fence sfence) + (defvoid cpu-memory-fence mfence) + (defvoid cpu-pause pause))) + +#+ecl +(progn + (def-intrinsic cpu-mxcsr () fixnum "_mm_getcsr") + (def-intrinsic %set-cpu-mxcsr (fixnum) fixnum "_mm_setcsr" :export nil :ret-arg 0) + + (defsetf cpu-mxcsr %set-cpu-mxcsr) + + (def-intrinsic cpu-load-fence () nil "_mm_lfence") + (def-intrinsic cpu-store-fence () nil "_mm_sfence") + (def-intrinsic cpu-memory-fence () nil "_mm_mfence") + + (def-intrinsic cpu-pause () nil "_mm_pause")) + +;;; Single-float + +;; Initialization + +#+sbcl +(def-float-set-intrinsic set-ss %set-ss single-float single-reg float-sse-pack movaps) + +#+ecl +(progn + (def-intrinsic set-ss (single-float) float-sse-pack "_mm_set_ss") + (def-intrinsic set1-ps (single-float) float-sse-pack "_mm_set1_ps") + + (def-intrinsic set-ps (single-float single-float single-float single-float) float-sse-pack "_mm_set_ps") + (def-intrinsic setr-ps (single-float single-float single-float single-float) float-sse-pack "_mm_setr_ps") + + (def-intrinsic setzero-ps () float-sse-pack "_mm_setzero_ps")) + +;; Memory + +(def-load-intrinsic mem-ref-ss float-sse-pack movss "_mm_load_ss") + +(def-load-intrinsic mem-ref-ps float-sse-pack movups "_mm_loadu_ps") +(def-load-intrinsic mem-ref-aps float-sse-pack movaps "_mm_load_ps") + +(def-store-intrinsic mem-set-ss float-sse-pack movss "_mm_store_ss" :setf-name mem-ref-ss) + +(def-store-intrinsic mem-set-ps float-sse-pack movups "_mm_storeu_ps" :setf-name mem-ref-ps) +(def-store-intrinsic mem-set-aps float-sse-pack movaps "_mm_store_ps" :setf-name mem-ref-aps) + +(def-store-intrinsic stream-ps float-sse-pack movntps "_mm_stream_ps") + +;; Arithmetics + +(def-binary-intrinsic add-ss float-sse-pack addss 3 "_mm_add_ss") +(def-binary-intrinsic add-ps float-sse-pack addps 3 "_mm_add_ps" :commutative t) +(def-binary-intrinsic sub-ss float-sse-pack subss 3 "_mm_sub_ss") +(def-binary-intrinsic sub-ps float-sse-pack subps 3 "_mm_sub_ps") +(def-binary-intrinsic mul-ss float-sse-pack mulss 5 "_mm_mul_ss") +(def-binary-intrinsic mul-ps float-sse-pack mulps 5 "_mm_mul_ps" :commutative t) +(def-binary-intrinsic div-ss float-sse-pack divss 13 "_mm_div_ss") +(def-binary-intrinsic div-ps float-sse-pack divps 13 "_mm_div_ps") +(def-binary-intrinsic min-ss float-sse-pack minss 3 "_mm_min_ss") +(def-binary-intrinsic min-ps float-sse-pack minps 3 "_mm_min_ps":commutative t) +(def-binary-intrinsic max-ss float-sse-pack maxss 3 "_mm_max_ss") +(def-binary-intrinsic max-ps float-sse-pack maxps 3 "_mm_max_ps" :commutative t) + +(def-unary-intrinsic sqrt-ss float-sse-pack sqrtss 20 "_mm_sqrt_ss" :partial t) +(def-unary-intrinsic sqrt-ps float-sse-pack sqrtps 20 "_mm_sqrt_ps") +(def-unary-intrinsic rsqrt-ss float-sse-pack rsqrtss 20 "_mm_rsqrt_ss" :partial t) +(def-unary-intrinsic rsqrt-ps float-sse-pack rsqrtps 20 "_mm_rsqrt_ps") +(def-unary-intrinsic rcp-ss float-sse-pack rcpss 13 "_mm_rcp_ss" :partial t) +(def-unary-intrinsic rcp-ps float-sse-pack rcpps 13 "_mm_rcp_ps") + +;; Bitwise logic + +#+sbcl +(def-not-intrinsic not-ps float-sse-pack xorps) + +(def-binary-intrinsic and-ps float-sse-pack andps 1 "_mm_and_ps" :commutative t) +(def-binary-intrinsic andnot-ps float-sse-pack andnps 1 "_mm_andnot_ps") +(def-binary-intrinsic or-ps float-sse-pack orps 1 "_mm_or_ps" :commutative t) +(def-binary-intrinsic xor-ps float-sse-pack xorps 1 "_mm_xor_ps" :commutative t) + +;; Comparisons + +(def-binary-intrinsic =-ss float-sse-pack cmpss 3 "_mm_cmpeq_ss" :tags (:eq)) +(def-binary-intrinsic =-ps float-sse-pack cmpps 3 "_mm_cmpeq_ps" :tags (:eq) :commutative t) +(def-binary-intrinsic <-ss float-sse-pack cmpss 3 "_mm_cmplt_ss" :tags (:lt)) +(def-binary-intrinsic <-ps float-sse-pack cmpps 3 "_mm_cmplt_ps" :tags (:lt)) +(def-binary-intrinsic <=-ss float-sse-pack cmpss 3 "_mm_cmple_ss" :tags (:le)) +(def-binary-intrinsic <=-ps float-sse-pack cmpps 3 "_mm_cmple_ps" :tags (:le)) +#+ecl +(def-binary-intrinsic >-ss float-sse-pack nil nil "_mm_cmpgt_ss") +#+ecl +(def-binary-intrinsic >-ps float-sse-pack nil nil "_mm_cmpgt_ps") +#+ecl +(def-binary-intrinsic >=-ss float-sse-pack nil nil "_mm_cmpge_ss") +#+ecl +(def-binary-intrinsic >=-ps float-sse-pack nil nil "_mm_cmpge_ps") + +(def-binary-intrinsic /=-ss float-sse-pack cmpss 3 "_mm_cmpneq_ss" :tags (:neq)) +(def-binary-intrinsic /=-ps float-sse-pack cmpps 3 "_mm_cmpneq_ps" :tags (:neq) :commutative t) +(def-binary-intrinsic /<-ss float-sse-pack cmpss 3 "_mm_cmpnlt_ss" :tags (:nlt)) +(def-binary-intrinsic /<-ps float-sse-pack cmpps 3 "_mm_cmpnlt_ps" :tags (:nlt)) +(def-binary-intrinsic /<=-ss float-sse-pack cmpss 3 "_mm_cmpnle_ss" :tags (:nle)) +(def-binary-intrinsic /<=-ps float-sse-pack cmpps 3 "_mm_cmpnle_ps" :tags (:nle)) +#+ecl +(def-binary-intrinsic />-ss float-sse-pack nil nil "_mm_cmpngt_ss") +#+ecl +(def-binary-intrinsic />-ps float-sse-pack nil nil "_mm_cmpngt_ps") +#+ecl +(def-binary-intrinsic />=-ss float-sse-pack nil nil "_mm_cmpnge_ss") +#+ecl +(def-binary-intrinsic />=-ps float-sse-pack nil nil "_mm_cmpnge_ps") + +(def-binary-intrinsic cmpord-ss float-sse-pack cmpss 3 "_mm_cmpord_ss" :tags (:ord)) ; neither is NaN +(def-binary-intrinsic cmpord-ps float-sse-pack cmpps 3 "_mm_cmpord_ps" :tags (:ord) :commutative t) +(def-binary-intrinsic cmpunord-ss float-sse-pack cmpss 3 "_mm_cmpunord_ss" :tags (:unord)) +(def-binary-intrinsic cmpunord-ps float-sse-pack cmpps 3 "_mm_cmpunord_ps" :tags (:unord) :commutative t) + +#| Skipped: _mm_u?comi.*_ss |# + +;; Misc + +(def-binary-intrinsic unpackhi-ps float-sse-pack unpckhps 1 "_mm_unpackhi_ps") +(def-binary-intrinsic unpacklo-ps float-sse-pack unpcklps 1 "_mm_unpacklo_ps") + +(def-binary-intrinsic move-ss float-sse-pack movss 1 "_mm_move_ss") + +(def-binary-intrinsic movehl-ps float-sse-pack movhlps 1 "_mm_movehl_ps") +(def-binary-intrinsic movelh-ps float-sse-pack movlhps 1 "_mm_movelh_ps") + +(def-unary-intrinsic movemask-ps (unsigned-byte 4) movmskps 1 "_mm_movemask_ps" :arg-type float-sse-pack) + +;; Shuffle + +(def-binary-intrinsic shuffle-ps float-sse-pack shufps 1 "_mm_shuffle_ps" :immediate-arg (unsigned-byte 8)) + +;; Conversion + +(def-unary-intrinsic convert-pi32-to-ps float-sse-pack cvtdq2ps 3 "_mm_cvtepi32_ps" :arg-type int-sse-pack) +(def-unary-intrinsic convert-ps-to-pi32 int-sse-pack cvtps2dq 3 "_mm_cvtps_epi32" :arg-type float-sse-pack) +(def-unary-intrinsic truncate-ps-to-pi32 int-sse-pack cvttps2dq 3 "_mm_cvttps_epi32" :arg-type float-sse-pack) + +(def-sse-int-intrinsic convert-si32-to-ss (signed-byte 32) float-sse-pack cvtsi2ss 3 "_mm_cvtsi32_ss") +(def-cvt-to-int32-intrinsic convert-ss-to-si32 (signed-byte 32) cvtss2si 3 "_mm_cvtss_si32" :arg-type float-sse-pack) +(def-cvt-to-int32-intrinsic truncate-ss-to-si32 (signed-byte 32) cvttss2si 3 "_mm_cvttss_si32" :arg-type float-sse-pack) + +#+(or x86_64 x86-64) +(def-sse-int-intrinsic convert-si64-to-ss (signed-byte 64) float-sse-pack cvtsi2ss 3 + #-msvc "_mm_cvtsi64_ss" #+msvc "_mm_cvtsi64x_ss") +#+(or x86_64 x86-64) +(def-unary-intrinsic convert-ss-to-si64 (signed-byte 64) cvtss2si 3 + #-msvc "_mm_cvtss_si64" #+msvc "_mm_cvtss_si64x" :arg-type float-sse-pack) +#+(or x86_64 x86-64) +(def-unary-intrinsic truncate-ss-to-si64 (signed-byte 64) cvttss2si 3 + #-msvc "_mm_cvttss_si64" #+msvc "_mm_cvttss_si64x" :arg-type float-sse-pack) + +;;; Double-float + +;; Initialization + +#+sbcl +(def-float-set-intrinsic set-sd %set-sd double-float double-reg double-sse-pack movapd) + +#+ecl +(progn + (def-intrinsic set-sd (double-float) double-sse-pack "_mm_set_sd") + (def-intrinsic set1-pd (double-float) double-sse-pack "_mm_set1_pd") + + (def-intrinsic set-pd (double-float double-float) double-sse-pack "_mm_set_pd") + (def-intrinsic setr-pd (double-float double-float) double-sse-pack "_mm_setr_pd") + + (def-intrinsic setzero-pd () double-sse-pack "_mm_setzero_pd")) + +;; Memory + +(def-load-intrinsic mem-ref-sd double-sse-pack movsd "_mm_load_sd") + +(def-load-intrinsic mem-ref-pd double-sse-pack movupd "_mm_loadu_pd") +(def-load-intrinsic mem-ref-apd double-sse-pack movapd "_mm_load_pd") + +(def-load-intrinsic loadh-pd double-sse-pack movhpd "_mm_loadh_pd" :register-arg t) +(def-load-intrinsic loadl-pd double-sse-pack movlpd "_mm_loadl_pd" :register-arg t) + +(def-store-intrinsic mem-set-sd double-sse-pack movsd "_mm_store_sd" :setf-name mem-ref-sd) + +(def-store-intrinsic mem-set-pd double-sse-pack movupd "_mm_storeu_pd" :setf-name mem-ref-pd) +(def-store-intrinsic mem-set-apd double-sse-pack movapd "_mm_store_pd" :setf-name mem-ref-apd) + +(def-store-intrinsic storeh-pd double-sse-pack movhpd "_mm_storeh_pd") +(def-store-intrinsic storel-pd double-sse-pack movlpd "_mm_storel_pd") + +(def-store-intrinsic stream-pd double-sse-pack movntpd "_mm_stream_pd") + +;; Arithmetics + +(def-binary-intrinsic add-sd double-sse-pack addsd 3 "_mm_add_sd") +(def-binary-intrinsic add-pd double-sse-pack addpd 3 "_mm_add_pd" :commutative t) +(def-binary-intrinsic sub-sd double-sse-pack subsd 3 "_mm_sub_sd") +(def-binary-intrinsic sub-pd double-sse-pack subpd 3 "_mm_sub_pd") +(def-binary-intrinsic mul-sd double-sse-pack mulsd 5 "_mm_mul_sd") +(def-binary-intrinsic mul-pd double-sse-pack mulpd 5 "_mm_mul_pd" :commutative t) +(def-binary-intrinsic div-sd double-sse-pack divsd 13 "_mm_div_sd") +(def-binary-intrinsic div-pd double-sse-pack divpd 13 "_mm_div_pd") +(def-binary-intrinsic min-sd double-sse-pack minsd 3 "_mm_min_sd") +(def-binary-intrinsic min-pd double-sse-pack minpd 3 "_mm_min_pd" :commutative t) +(def-binary-intrinsic max-sd double-sse-pack maxsd 3 "_mm_max_sd") +(def-binary-intrinsic max-pd double-sse-pack maxpd 3 "_mm_max_pd" :commutative t) + +(def-binary-intrinsic sqrt-sd double-sse-pack sqrtsd 20 "_mm_sqrt_sd") +(def-unary-intrinsic sqrt-pd double-sse-pack sqrtpd 20 "_mm_sqrt_pd") + +;; Bitwise logic + +#+sbcl +(def-not-intrinsic not-pd double-sse-pack xorpd) + +(def-binary-intrinsic and-pd double-sse-pack andpd 1 "_mm_and_pd" :commutative t) +(def-binary-intrinsic andnot-pd double-sse-pack andnpd 1 "_mm_andnot_pd") +(def-binary-intrinsic or-pd double-sse-pack orpd 1 "_mm_or_pd" :commutative t) +(def-binary-intrinsic xor-pd double-sse-pack xorpd 1 "_mm_xor_pd" :commutative t) + +;; Comparisons + +(def-binary-intrinsic =-sd double-sse-pack cmpsd 3 "_mm_cmpeq_sd" :tags (:eq)) +(def-binary-intrinsic =-pd double-sse-pack cmppd 3 "_mm_cmpeq_pd" :tags (:eq) :commutative t) +(def-binary-intrinsic <-sd double-sse-pack cmpsd 3 "_mm_cmplt_sd" :tags (:lt)) +(def-binary-intrinsic <-pd double-sse-pack cmppd 3 "_mm_cmplt_pd" :tags (:lt)) +(def-binary-intrinsic <=-sd double-sse-pack cmpsd 3 "_mm_cmple_sd" :tags (:le)) +(def-binary-intrinsic <=-pd double-sse-pack cmppd 3 "_mm_cmple_pd" :tags (:le)) +#+ecl +(def-binary-intrinsic >-sd double-sse-pack nil nil "_mm_cmpgt_sd") +#+ecl +(def-binary-intrinsic >-pd double-sse-pack nil nil "_mm_cmpgt_pd") +#+ecl +(def-binary-intrinsic >=-sd double-sse-pack nil nil "_mm_cmpge_sd") +#+ecl +(def-binary-intrinsic >=-pd double-sse-pack nil nil "_mm_cmpge_pd") + +(def-binary-intrinsic /=-sd double-sse-pack cmpsd 3 "_mm_cmpneq_sd" :tags (:neq)) +(def-binary-intrinsic /=-pd double-sse-pack cmppd 3 "_mm_cmpneq_pd" :tags (:neq) :commutative t) +(def-binary-intrinsic /<-sd double-sse-pack cmpsd 3 "_mm_cmpnlt_sd" :tags (:nlt)) +(def-binary-intrinsic /<-pd double-sse-pack cmppd 3 "_mm_cmpnlt_pd" :tags (:nlt)) +(def-binary-intrinsic /<=-sd double-sse-pack cmpsd 3 "_mm_cmpnle_sd" :tags (:nle)) +(def-binary-intrinsic /<=-pd double-sse-pack cmppd 3 "_mm_cmpnle_pd" :tags (:nle)) +#+ecl +(def-binary-intrinsic />-sd double-sse-pack nil nil "_mm_cmpngt_sd") +#+ecl +(def-binary-intrinsic />-pd double-sse-pack nil nil "_mm_cmpngt_pd") +#+ecl +(def-binary-intrinsic />=-sd double-sse-pack nil nil "_mm_cmpnge_sd") +#+ecl +(def-binary-intrinsic />=-pd double-sse-pack nil nil "_mm_cmpnge_pd") + +(def-binary-intrinsic cmpord-sd double-sse-pack cmpsd 3 "_mm_cmpord_sd" :tags (:ord)) ; neither is NaN +(def-binary-intrinsic cmpord-pd double-sse-pack cmppd 3 "_mm_cmpord_pd" :tags (:ord) :commutative t) +(def-binary-intrinsic cmpunord-sd double-sse-pack cmpsd 3 "_mm_cmpunord_sd" :tags (:unord)) +(def-binary-intrinsic cmpunord-pd double-sse-pack cmppd 3 "_mm_cmpunord_pd" :tags (:unord) :commutative t) + +;; Misc + +(def-binary-intrinsic unpackhi-pd double-sse-pack unpckhpd 1 "_mm_unpackhi_pd") +(def-binary-intrinsic unpacklo-pd double-sse-pack unpcklpd 1 "_mm_unpacklo_pd") + +(def-binary-intrinsic move-sd double-sse-pack movsd 1 "_mm_move_sd") + +(def-unary-intrinsic movemask-pd (unsigned-byte 2) movmskpd 1 "_mm_movemask_pd" :arg-type double-sse-pack) + +;; Shuffle + +(def-binary-intrinsic shuffle-pd double-sse-pack shufpd 1 "_mm_shuffle_pd" :immediate-arg (unsigned-byte 2)) + +;; Conversion + +(def-unary-intrinsic convert-ps-to-pd double-sse-pack cvtps2pd 3 "_mm_cvtps_pd" :arg-type float-sse-pack) +(def-unary-intrinsic convert-pd-to-ps float-sse-pack cvtpd2ps 3 "_mm_cvtpd_ps" :arg-type double-sse-pack) + +(def-binary-intrinsic convert-ss-to-sd double-sse-pack cvtss2sd 3 "_mm_cvtss_sd" :y-type float-sse-pack) +(def-binary-intrinsic convert-sd-to-ss float-sse-pack cvtsd2ss 3 "_mm_cvtsd_ss" :y-type double-sse-pack) + +(def-unary-intrinsic convert-pi32-to-pd double-sse-pack cvtdq2pd 3 "_mm_cvtepi32_pd" :arg-type int-sse-pack) +(def-unary-intrinsic convert-pd-to-pi32 int-sse-pack cvtpd2dq 3 "_mm_cvtpd_epi32" :arg-type double-sse-pack) +(def-unary-intrinsic truncate-pd-to-pi32 int-sse-pack cvttpd2dq 3 "_mm_cvttpd_epi32" :arg-type double-sse-pack) + +(def-sse-int-intrinsic convert-si32-to-sd (signed-byte 32) double-sse-pack cvtsi2ss 3 "_mm_cvtsi32_sd") +(def-cvt-to-int32-intrinsic convert-sd-to-si32 (signed-byte 32) cvtsd2si 3 "_mm_cvtsd_si32" :arg-type double-sse-pack) +(def-cvt-to-int32-intrinsic truncate-sd-to-si32 (signed-byte 32) cvttsd2si 3 "_mm_cvttsd_si32" :arg-type double-sse-pack) + +#+(or x86_64 x86-64) +(def-sse-int-intrinsic convert-si64-to-sd (signed-byte 64) double-sse-pack cvtsi2ss 3 + #-msvc "_mm_cvtsi64_sd" #+msvc "_mm_cvtsi64x_sd") +#+(or x86_64 x86-64) +(def-unary-intrinsic convert-sd-to-si64 (signed-byte 64) cvtsd2si 3 + #-msvc "_mm_cvtsd_si64" #+msvc "_mm_cvtsd_si64x" :arg-type double-sse-pack) +#+(or x86_64 x86-64) +(def-unary-intrinsic truncate-sd-to-si64 (signed-byte 64) cvttsd2si 3 + #-msvc "_mm_cvttsd_si64" #+msvc "_mm_cvttsd_si64x" :arg-type double-sse-pack) + +;;; Integer + +;; Initialization + +#+ecl +(progn + (def-intrinsic set1-pi8 (fixnum) int-sse-pack "_mm_set1_epi8") + (def-intrinsic set1-pi16 (fixnum) int-sse-pack "_mm_set1_epi16") + (def-intrinsic set1-pi32 (ext:integer32) int-sse-pack "_mm_set1_epi32") + #+x86_64 + (def-intrinsic set1-pi64 (ext:integer64) int-sse-pack "_mm_set1_epi64x") + + (def-intrinsic set1-pu32 (ext:byte32) int-sse-pack "_mm_set1_epi32") + #+x86_64 + (def-intrinsic set1-pu64 (ext:byte64) int-sse-pack "_mm_set1_epi64x") + + ;;----- + (def-intrinsic set-pi8 (fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi8") + (def-intrinsic set-pi16 (fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi16") + (def-intrinsic set-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_set_epi32") + #+x86_64 + (def-intrinsic set-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x") + + (def-intrinsic set-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_set_epi32") + #+x86_64 + (def-intrinsic set-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x") + + ;;----- + (def-intrinsic setr-pi8 (fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi8") + (def-intrinsic setr-pi16 (fixnum fixnum fixnum fixnum + fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi16") + (def-intrinsic setr-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_setr_epi32") + #+x86_64 + (def-intrinsic setr-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x" :reorder-args t) + + (def-intrinsic setr-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_setr_epi32") + #+x86_64 + (def-intrinsic setr-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x" :reorder-args t) + + ;;----- + (def-intrinsic setzero-pi () int-sse-pack "_mm_setzero_si128")) + +;; Memory + +(def-load-intrinsic mem-ref-pi int-sse-pack movdqu "_mm_loadu_si128") +(def-load-intrinsic mem-ref-api int-sse-pack movdqa "_mm_load_si128") + +(def-load-intrinsic mem-ref-si64 int-sse-pack movd "_mm_loadl_epi64") + +(def-store-intrinsic mem-set-pi int-sse-pack movdqu "_mm_storeu_si128" :setf-name mem-ref-pi) +(def-store-intrinsic mem-set-api int-sse-pack movdqa "_mm_store_si128" :setf-name mem-ref-api) + +(def-store-intrinsic mem-set-si64 int-sse-pack movd "_mm_storel_epi64" :setf-name mem-ref-si64) + +(def-store-intrinsic stream-pi int-sse-pack movntdq "_mm_stream_si128") + +;; Masked move + +#+ecl +(def-mem-intrinsic maskmoveu-pi "char" nil "_mm_maskmoveu_si128" :prefix-args (int-sse-pack int-sse-pack)) + +#+sbcl +(progn + (defknown %maskmoveu-pi (sse-pack sse-pack system-area-pointer fixnum) (values) (unsafe)) + + (define-vop (%maskmoveu-pi) + (:translate %maskmoveu-pi) + (:args (value :scs (sse-reg)) + (mask :scs (sse-reg)) + (sap :scs (sap-reg) :target rdi) + (offset :scs (signed-reg))) + (:arg-types sse-pack sse-pack system-area-pointer signed-num) + (:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi) + (:policy :fast-safe) + (:note "inline MASKMOVEU operation") + (:generator 5 + (if (location= sap rdi) + (inst add rdi offset) + (inst lea rdi (make-ea :qword :base sap :index offset))) + (inst maskmovdqu value mask))) + + (define-vop (%maskmoveu-pi-c) + (:translate %maskmoveu-pi) + (:args (value :scs (sse-reg)) + (mask :scs (sse-reg)) + (sap :scs (sap-reg) :target rdi)) + (:arg-types sse-pack sse-pack system-area-pointer (:constant (signed-byte 32))) + (:info offset) + (:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi) + (:policy :fast-safe) + (:note "inline MASKMOVEU operation") + (:generator 4 + (if (location= sap rdi) + (unless (= offset 0) + (inst add rdi offset)) + (if (= offset 0) + (inst mov rdi sap) + (inst lea rdi (make-ea :qword :base sap :disp offset)))) + (inst maskmovdqu value mask))) + + (def-splice-transform %maskmoveu-pi (value mask (sap+ sap offset1) offset2) + (%maskmoveu-pi value mask sap (+ offset1 offset2)))) + +;; Arithmetics + +(def-binary-intrinsic add-pi8 int-sse-pack paddb 1 "_mm_add_epi8" :commutative t) +(def-binary-intrinsic add-pi16 int-sse-pack paddw 1 "_mm_add_epi16" :commutative t) +(def-binary-intrinsic add-pi32 int-sse-pack paddd 1 "_mm_add_epi32" :commutative t) +(def-binary-intrinsic add-pi64 int-sse-pack paddq 1 "_mm_add_epi64" :commutative t) + +(def-binary-intrinsic adds-pi8 int-sse-pack paddsb 1 "_mm_adds_epi8" :commutative t) +(def-binary-intrinsic adds-pi16 int-sse-pack paddsw 1 "_mm_adds_epi16" :commutative t) +(def-binary-intrinsic adds-pu8 int-sse-pack paddusb 1 "_mm_adds_epu8" :commutative t) +(def-binary-intrinsic adds-pu16 int-sse-pack paddusw 1 "_mm_adds_epu16" :commutative t) + +(def-binary-intrinsic avg-pu8 int-sse-pack pavgb 1 "_mm_avg_epu8" :commutative t) +(def-binary-intrinsic avg-pu16 int-sse-pack pavgw 1 "_mm_avg_epu16" :commutative t) + +(def-binary-intrinsic madd-pi16 int-sse-pack pmaddwd 1 "_mm_madd_epi16" :commutative t) + +(def-binary-intrinsic max-pu8 int-sse-pack pmaxub 1 "_mm_max_epu8" :commutative t) +(def-binary-intrinsic max-pi16 int-sse-pack pmaxsw 1 "_mm_max_epi16" :commutative t) +(def-binary-intrinsic min-pu8 int-sse-pack pminub 1 "_mm_min_epu8" :commutative t) +(def-binary-intrinsic min-pi16 int-sse-pack pminsw 1 "_mm_min_epi16" :commutative t) + +(def-binary-intrinsic mulhi-pi16 int-sse-pack pmulhw 3 "_mm_mulhi_epi16" :commutative t) +(def-binary-intrinsic mulhi-pu16 int-sse-pack pmulhuw 3 "_mm_mulhi_epu16" :commutative t) +(def-binary-intrinsic mullo-pi16 int-sse-pack pmullw 3 "_mm_mullo_epi16" :commutative t) + +(def-binary-intrinsic mul-pu32 int-sse-pack pmuludq 3 "_mm_mul_epu32" :commutative t) + +(def-binary-intrinsic sad-pu8 int-sse-pack psadbw 1 "_mm_sad_epu8" :commutative t) + +(def-binary-intrinsic sub-pi8 int-sse-pack psubb 1 "_mm_sub_epi8") +(def-binary-intrinsic sub-pi16 int-sse-pack psubw 1 "_mm_sub_epi16") +(def-binary-intrinsic sub-pi32 int-sse-pack psubd 1 "_mm_sub_epi32") +(def-binary-intrinsic sub-pi64 int-sse-pack psubq 1 "_mm_sub_epi64") + +(def-binary-intrinsic subs-pi8 int-sse-pack psubsb 1 "_mm_subs_epi8") +(def-binary-intrinsic subs-pi16 int-sse-pack psubsw 1 "_mm_subs_epi16") +(def-binary-intrinsic subs-pu8 int-sse-pack psubusb 1 "_mm_subs_epu8") +(def-binary-intrinsic subs-pu16 int-sse-pack psubusw 1 "_mm_subs_epu16") + +;; Bitwise logic + +#+sbcl +(def-not-intrinsic not-pi int-sse-pack pxor) + +(def-binary-intrinsic and-pi int-sse-pack pand 1 "_mm_and_si128" :commutative t) +(def-binary-intrinsic andnot-pi int-sse-pack pandn 1 "_mm_andnot_si128") +(def-binary-intrinsic or-pi int-sse-pack por 1 "_mm_or_si128" :commutative t) +(def-binary-intrinsic xor-pi int-sse-pack pxor 1 "_mm_xor_si128" :commutative t) + +;; Shifts + +(def-unary-intrinsic slli-pi int-sse-pack pslldq 1 "_mm_slli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8)) + +(def-sse-int-intrinsic slli-pi16 fixnum int-sse-pack psllw 3 "_mm_slli_epi16" :make-temporary t) +(def-sse-int-intrinsic slli-pi32 fixnum int-sse-pack pslld 3 "_mm_slli_epi32" :make-temporary t) +(def-sse-int-intrinsic slli-pi64 fixnum int-sse-pack psllq 3 "_mm_slli_epi64" :make-temporary t) +(def-binary-intrinsic sll-pi16 int-sse-pack psllw 1 "_mm_sll_epi16") +(def-binary-intrinsic sll-pi32 int-sse-pack pslld 1 "_mm_sll_epi32") +(def-binary-intrinsic sll-pi64 int-sse-pack psllq 1 "_mm_sll_epi64") + +(def-sse-int-intrinsic srai-pi16 fixnum int-sse-pack psraw 3 "_mm_srai_epi16" :make-temporary t) +(def-sse-int-intrinsic srai-pi32 fixnum int-sse-pack psrad 3 "_mm_srai_epi32" :make-temporary t) +(def-binary-intrinsic sra-pi16 int-sse-pack psraw 1 "_mm_sra_epi16") +(def-binary-intrinsic sra-pi32 int-sse-pack psrad 1 "_mm_sra_epi32") + +(def-unary-intrinsic srli-pi int-sse-pack psrldq 1 "_mm_srli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8)) + +(def-sse-int-intrinsic srli-pi16 fixnum int-sse-pack psrlw 3 "_mm_srli_epi16" :make-temporary t) +(def-sse-int-intrinsic srli-pi32 fixnum int-sse-pack psrld 3 "_mm_srli_epi32" :make-temporary t) +(def-sse-int-intrinsic srli-pi64 fixnum int-sse-pack psrlq 3 "_mm_srli_epi64" :make-temporary t) +(def-binary-intrinsic srl-pi16 int-sse-pack psrlw 1 "_mm_srl_epi16") +(def-binary-intrinsic srl-pi32 int-sse-pack psrld 1 "_mm_srl_epi32") +(def-binary-intrinsic srl-pi64 int-sse-pack psrlq 1 "_mm_srl_epi64") + +#+sbcl +(macrolet ((defimm (name insn bits &key arithmetic) + `(define-vop (,(symbolicate "%" name "-IMM") sse-int-base-op) + (:translate ,name) + (:args (x :scs (sse-reg) :target r)) + (:arg-types sse-pack (:constant fixnum)) + (:result-types sb-kernel:int-sse-pack) + (:info immv) + (:generator 1 + ,@(let ((core `(progn + (ensure-move int-sse-pack r x) + (unless (= immv 0) + (inst ,insn r immv))))) + (if arithmetic + `((when (or (< immv 0) (>= immv ,bits)) + (setf immv ,bits)) + ,core) + `((if (or (< immv 0) (>= immv ,bits)) + (inst pxor r r) + ,core)))))))) + (defimm slli-pi16 psllw-imm 16) + (defimm slli-pi32 pslld-imm 32) + (defimm slli-pi64 psllq-imm 64) + (defimm srai-pi16 psraw-imm 16 :arithmetic t) + (defimm srai-pi32 psrad-imm 32 :arithmetic t) + (defimm srli-pi16 psrlw-imm 16) + (defimm srli-pi32 psrld-imm 32) + (defimm srli-pi64 psrlq-imm 64)) + +;; Comparisons + +(def-binary-intrinsic =-pi8 int-sse-pack pcmpeqb 1 "_mm_cmpeq_epi8") +(def-binary-intrinsic =-pi16 int-sse-pack pcmpeqw 1 "_mm_cmpeq_epi16") +(def-binary-intrinsic =-pi32 int-sse-pack pcmpeqd 1 "_mm_cmpeq_epi32") + +#+ecl +(def-binary-intrinsic <-pi8 int-sse-pack nil nil "_mm_cmplt_epi8") +#+ecl +(def-binary-intrinsic <-pi16 int-sse-pack nil nil "_mm_cmplt_epi16") +#+ecl +(def-binary-intrinsic <-pi32 int-sse-pack nil nil "_mm_cmplt_epi32") + +(def-binary-intrinsic >-pi8 int-sse-pack pcmpgtb 1 "_mm_cmpgt_epi8") +(def-binary-intrinsic >-pi16 int-sse-pack pcmpgtw 1 "_mm_cmpgt_epi16") +(def-binary-intrinsic >-pi32 int-sse-pack pcmpgtd 1 "_mm_cmpgt_epi32") + +;; Misc + +(def-binary-intrinsic packs-pi16 int-sse-pack packsswb 1 "_mm_packs_epi16") +(def-binary-intrinsic packs-pi32 int-sse-pack packssdw 1 "_mm_packs_epi32") +(def-binary-intrinsic packus-pi16 int-sse-pack packuswb 1 "_mm_packus_epi16") + +(def-unary-intrinsic extract-pi16 (unsigned-byte 16) pextrw 1 "_mm_extract_epi16" + :immediate-arg (unsigned-byte 8) :arg-type int-sse-pack) +(def-sse-int-intrinsic insert-pi16 fixnum int-sse-pack pinsrw 1 "_mm_insert_epi16" + :immediate-arg (unsigned-byte 8)) + +(def-unary-intrinsic movemask-pi8 (unsigned-byte 16) pmovmskb 1 "_mm_movemask_epi8" :arg-type int-sse-pack) + +(def-binary-intrinsic unpackhi-pi8 int-sse-pack punpckhbw 1 "_mm_unpackhi_epi8") +(def-binary-intrinsic unpackhi-pi16 int-sse-pack punpckhwd 1 "_mm_unpackhi_epi16") +(def-binary-intrinsic unpackhi-pi32 int-sse-pack punpckhdq 1 "_mm_unpackhi_epi32") +(def-binary-intrinsic unpackhi-pi64 int-sse-pack punpckhqdq 1 "_mm_unpackhi_epi64") + +(def-binary-intrinsic unpacklo-pi8 int-sse-pack punpcklbw 1 "_mm_unpacklo_epi8") +(def-binary-intrinsic unpacklo-pi16 int-sse-pack punpcklwd 1 "_mm_unpacklo_epi16") +(def-binary-intrinsic unpacklo-pi32 int-sse-pack punpckldq 1 "_mm_unpacklo_epi32") +(def-binary-intrinsic unpacklo-pi64 int-sse-pack punpcklqdq 1 "_mm_unpacklo_epi64") + +(def-unary-intrinsic move-pi64 int-sse-pack movq 1 "_mm_move_epi64") + +;; Shuffle + +(def-unary-intrinsic shuffle-pi32 int-sse-pack pshufd 1 "_mm_shuffle_epi32" :immediate-arg (unsigned-byte 8)) +(def-unary-intrinsic shufflelo-pi16 int-sse-pack pshuflw 1 "_mm_shufflelo_epi16" :immediate-arg (unsigned-byte 8)) +(def-unary-intrinsic shufflehi-pi16 int-sse-pack pshufhw 1 "_mm_shufflehi_epi16" :immediate-arg (unsigned-byte 8)) + +;; Conversion + +#+sbcl +(progn + (export 'convert-si32-to-pi) + (defknown convert-si32-to-pi ((signed-byte 32)) int-sse-pack (foldable flushable)) + (export 'convert-su32-to-pi) + (defknown convert-su32-to-pi ((unsigned-byte 32)) int-sse-pack (foldable flushable)) + (export 'convert-si64-to-pi) + (defknown convert-si64-to-pi ((signed-byte 64)) int-sse-pack (foldable flushable)) + (export 'convert-su64-to-pi) + (defknown convert-su64-to-pi ((unsigned-byte 64)) int-sse-pack (foldable flushable)) + (defknown %set-int ((signed-byte 64)) int-sse-pack (foldable flushable always-translatable)) + (defknown %set-uint ((unsigned-byte 64)) int-sse-pack (foldable flushable always-translatable)) + + (define-vop (%set-int) + (:translate %set-int %set-uint + convert-si32-to-pi convert-su32-to-pi + convert-si64-to-pi convert-su64-to-pi) + (:args (arg :scs (signed-reg unsigned-reg signed-stack unsigned-stack))) + (:arg-types untagged-num) + (:results (dst :scs (sse-reg))) + (:result-types sb-kernel:int-sse-pack) + (:policy :fast-safe) + (:generator 1 + (inst movd dst arg)))) + +#+ecl +(progn + (def-intrinsic convert-si32-to-pi (ext:integer32) int-sse-pack "_mm_cvtsi32_si128") + (def-intrinsic convert-su32-to-pi (ext:byte32) int-sse-pack "_mm_cvtsi32_si128") + #+x86_64 + (def-intrinsic convert-si64-to-pi (ext:integer64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128") + #+x86_64 + (def-intrinsic convert-su64-to-pi (ext:byte64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128")) + +(def-cvt-to-int32-intrinsic convert-pi-to-si32 (signed-byte 32) movd 1 "_mm_cvtsi128_si32" + :arg-type int-sse-pack) +(def-unary-intrinsic convert-pi-to-su32 (unsigned-byte 32) movd 1 "_mm_cvtsi128_si32" + :result-size :dword :arg-type int-sse-pack) + +#+(or x86_64 x86-64) +(def-unary-intrinsic convert-pi-to-si64 (signed-byte 64) movd 1 + #-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack) +#+(or x86_64 x86-64) +(def-unary-intrinsic convert-pi-to-su64 (unsigned-byte 64) movd 1 + #-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack) + diff --git a/contrib/cl-simd/sse-package.lisp b/contrib/cl-simd/sse-package.lisp new file mode 100644 index 000000000..c87af2f81 --- /dev/null +++ b/contrib/cl-simd/sse-package.lisp @@ -0,0 +1,61 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file defines a package for all SSE intrinsics. +;;; + +#+ecl +(eval-when (:load-toplevel) + (require 'cmp)) + +#+sbcl +(pushnew :SSE2 *features*) + +(defpackage #:SSE + #+sbcl + (:use #:COMMON-LISP #:SB-C #:SB-VM #:SB-INT #:SB-KERNEL #:SB-ASSEM #:SB-EXT #:SB-SYS) + #+sbcl + (:import-from #:SB-VM + #:SINGLE-REG #:DOUBLE-REG #:SSE-REG #:SSE-PACK-IMMEDIATE + #:SIGNED-REG #:SIGNED-STACK #:UNSIGNED-REG #:UNSIGNED-STACK + #:SIGNED-NUM #:UNSIGNED-NUM #:UNTAGGED-NUM #:IMMEDIATE + #:SAP-REG #:DESCRIPTOR-REG #:ANY-REG #:TAGGED-NUM + #:RAX-OFFSET #:RDI-OFFSET #:RBP-TN #:FRAME-BYTE-OFFSET + #:MAKE-EA #:REG-IN-SIZE #:LOADW) + #+sbcl + (:import-from #:SB-C + #:SPLICE-FUN-ARGS #:EXTRACT-FUN-ARGS + #:%DEFTRANSFORM #:COMMUTATIVE-ARG-SWAP + #:GIVE-UP-IR1-TRANSFORM #:ABORT-IR1-TRANSFORM + #:INSERT-ARRAY-BOUNDS-CHECKS #:VECTOR-LENGTH + #:ASSERT-ARRAY-RANK #:ASSERT-LVAR-TYPE + #:CONSTANT-LVAR-P #:LVAR-VALUE #:LVAR-TYPE + #:LEXENV-POLICY #:NODE-LEXENV + #:FIND-SAETP #:FIND-SAETP-BY-CTYPE) + #+sbcl + (:import-from #:SB-IMPL + #:%ARRAY-ROW-MAJOR-INDEX) + #+sbcl + (:shadow #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK) + #+ecl + (:use #:COMMON-LISP #:FFI) + #+ecl + (:import-from #:EXT + #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK + #:SSE-PACK-P #:ARRAY-ELEMENT-TYPE-BYTE-SIZE + #:*SSE-PACK-PRINT-MODE*) + #+ecl + (:shadow #:SSE-PACK) + ;; Common exports: + (:export #:SSE-PACK #:SSE-PACK-P + #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK + #:*SSE-PACK-PRINT-MODE* + #:SSE-ARRAY #:MAKE-SSE-ARRAY + #:0.0-PS #:TRUE-SS #:FALSE-SS #:TRUE-PS #:FALSE-PS + #:SET1-PS #:SET-PS #:SETR-PS #:SETZERO-PS + #:0.0-PD #:TRUE-SD #:FALSE-SD #:TRUE-PD #:FALSE-PD + #:SET1-PD #:SET-PD #:SETR-PD #:SETZERO-PD + #:0-PI #:TRUE-PI #:FALSE-PI #:SETZERO-PI + #:CPU-MXCSR #:CPU-MXCSR-BITS #:WITH-SAVED-MXCSR #:CPU-CONFIGURE-ROUNDING)) + diff --git a/contrib/cl-simd/sse-utils.lisp b/contrib/cl-simd/sse-utils.lisp new file mode 100644 index 000000000..ffe65d3e4 --- /dev/null +++ b/contrib/cl-simd/sse-utils.lisp @@ -0,0 +1,128 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) +;;; +;;; This file implements some common utility functions. +;;; + +(in-package #:SSE) + +;;; CPU control + +(eval-when (:compile-toplevel :load-toplevel :execute) + (declaim (ftype (function (&rest t) (unsigned-byte 32)) cpu-mxcsr-bits)) + (defun cpu-mxcsr-bits (&rest tags) + (loop with mask = 0 + for tag in tags + for bit = (if (listp tag) + (apply #'cpu-mxcsr-bits tag) + (ecase tag + (:except-invalid #x1) + (:except-denormal #x2) + (:except-divide-zero #x4) + (:except-overflow #x8) + (:except-underflow #x10) + (:except-precision #x20) + (:except-all #x3F) + (:denormals-are-zero #x40) + (:mask-invalid #x80) + (:mask-denormal #x100) + (:mask-divide-zero #x200) + (:mask-overflow #x400) + (:mask-underflow #x800) + (:mask-precision #x1000) + (:mask-all #x1f80) + (:round-nearest 0) + (:round-negative #x2000) + (:round-positive #x4000) + (:round-zero #x6000) + (:round-bits #x6000) + (:flush-to-zero #x8000))) + do (setf mask (logior mask bit)) + finally (return mask))) + (defun expand-cpu-mxcsr-bits (tags on-fail) + (loop for tag in tags + when (keywordp tag) collect tag into kwds + else collect tag into rest + finally + (return + (cond ((and kwds rest) + `(logior ,(apply #'cpu-mxcsr-bits kwds) + (cpu-mxcsr-bits ,@rest))) + (kwds + (apply #'cpu-mxcsr-bits kwds)) + (t on-fail)))))) + +(define-compiler-macro cpu-mxcsr-bits (&whole whole &rest tags) + (expand-cpu-mxcsr-bits tags whole)) + +(defmacro with-saved-mxcsr (&body code) + (let ((v (gensym "CSR"))) + `(let ((,v (cpu-mxcsr))) + (declare (type (unsigned-byte 32) ,v) + #+ecl (:read-only ,v)) + (unwind-protect (progn ,@code) + (%set-cpu-mxcsr ,v))))) + +#+nil +(defun cpu-check-exceptions (&rest tags) + (let ((mask (logand (cpu-mxcsr-bits (or tags :except-all)) + (cpu-mxcsr-bits :except-all))) + (csr (get-cpu-mxcsr))) + (declare (optimize (safety 0) (speed 3) (debug 0)) + (type fixnum csr mask)) + (not (zerop (logand mask csr))))) + +#+nil +(define-compiler-macro cpu-check-exceptions (&whole whole &rest tags) + (let ((bits (expand-cpu-mxcsr-bits (or tags '(except-all)) nil))) + (if (integerp bits) + `(locally (declare (optimize (speed 3) (safety 0) (debug 0))) + (not (zerop (logand (cpu-get-mxcsr) + ,(logand bits (cpu-mxcsr-bits :except-all)))))) + whole))) + +#+nil +(macrolet ((foo (&rest names) + (let* ((kwds (mapcar (lambda (x) (intern (format nil "MASK-~A" x) :keyword)) names)) + (pvars (mapcar (lambda (x) (intern (format nil "~A-P" x))) names))) + `(defun cpu-mask-exceptions (&key + ,@(mapcar (lambda (n p) `(,n nil ,p)) names pvars) + (other nil rest-p)) + (let ((set-bits (logior ,@(mapcar (lambda (n k) `(if ,n (cpu-mxcsr-bits ,k) 0)) names kwds))) + (arg-bits (logior ,@(mapcar (lambda (p k) `(if ,p (cpu-mxcsr-bits ,k) 0)) pvars kwds)))) + (%set-cpu-mxcsr + (the fixnum + (if (not rest-p) + (logior set-bits (logand (get-cpu-mxcsr) (lognot arg-bits))) + (logior set-bits + (if other (logand (cpu-mxcsr-bits :mask-all) (lognot arg-bits)) 0) + (logiand (get-cpu-mxcsr) (lognot (cpu-mxcsr-bits :mask-all))))))) + nil))))) + (foo invalid denormal divide-zero overflow underflow precision)) + +(defun cpu-configure-rounding (&key round-to + (denormals-are-zero nil daz-p) + (flush-to-zero nil ftz-p)) + (let ((set 0) + (mask 0)) + (when round-to + (setf mask (cpu-mxcsr-bits :round-bits) + set (ecase round-to + (:zero (cpu-mxcsr-bits :round-zero)) + (:negative (cpu-mxcsr-bits :round-negative)) + (:positive (cpu-mxcsr-bits :round-positive)) + (:nearest (cpu-mxcsr-bits :round-nearest))))) + (when daz-p + (setf mask (logior mask (cpu-mxcsr-bits :denormals-are-zero))) + (when denormals-are-zero + (setf set (logior set (cpu-mxcsr-bits :denormals-are-zero))))) + (when ftz-p + (setf mask (logior mask (cpu-mxcsr-bits :flush-to-zero))) + (when flush-to-zero + (setf set (logior set (cpu-mxcsr-bits :flush-to-zero))))) + (setf (cpu-mxcsr) + (the (unsigned-byte 32) + (logior set (logand (cpu-mxcsr) (lognot mask))))) + nil)) + diff --git a/contrib/cl-simd/test-sfmt.lisp b/contrib/cl-simd/test-sfmt.lisp new file mode 100644 index 000000000..517fa016a --- /dev/null +++ b/contrib/cl-simd/test-sfmt.lisp @@ -0,0 +1,156 @@ +;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- +;;; +;;; Dumbly translated from C code at: http://github.com/jj1bdx/sfmt-extstate + +(eval-when (:compile-toplevel :load-toplevel :execute) + (require :cl-simd)) + +(defpackage #:sfmt-test + (:use #:common-lisp #:sse)) + +(in-package #:sfmt-test) + +(deftype uint32 () '(unsigned-byte 32)) +(deftype uint32-vector () '(sse-array uint32 (*))) + +(defconstant +mexp+ 19937) +(defconstant +n+ (1+ (floor +mexp+ 128))) +(defconstant +pos1+ 122) +(defconstant +sl1+ 18) +(defconstant +sl2+ 1) +(defconstant +sr1+ 11) +(defconstant +sr2+ 1) + +(defconstant +msk1+ #xdfffffef) +(defconstant +msk2+ #xddfecb7f) +(defconstant +msk3+ #xbffaffff) +(defconstant +msk4+ #xbffffff6) + +(defconstant +parity1+ #x00000001) +(defconstant +parity2+ #x00000000) +(defconstant +parity3+ #x00000000) +(defconstant +parity4+ #x13c9e684) + +(defconstant +uint32-mask+ #xFFFFFFFF) + +(defvar *work-buffer* (make-sse-array (* +n+ 4) :element-type 'uint32)) + +(defun period-certification (buffer) + (declare (type uint32-vector buffer)) + (let ((inner (logxor (logand (aref buffer 0) +parity1+) + (logand (aref buffer 1) +parity2+) + (logand (aref buffer 2) +parity3+) + (logand (aref buffer 3) +parity4+)))) + (loop for i = 16 then (ash i -1) while (> i 0) + do (setf inner (logxor inner (ash inner (- i))))) + (when (logtest inner 1) + (return-from period-certification))) + (loop + for i from 0 to 3 + for parity in (load-time-value (list +parity1+ +parity2+ +parity3+ +parity4+)) + do (loop + for work = 1 then (ash work 1) + for j from 0 below 32 + when (/= 0 (logand work parity)) + do (progn + (setf (aref buffer i) + (logxor (aref buffer i) work)) + (return-from period-certification))))) + +(defun init-gen-rand (seed buffer) + (declare (type uint32 seed) + (type uint32-vector buffer)) + (setf (aref buffer 0) seed) + (loop for i from 1 below (array-total-size buffer) + do (setf (aref buffer i) + (logand +uint32-mask+ + (+ i + (* 1812433253 (logxor (aref buffer (1- i)) + (ash (aref buffer (1- i)) -30))))))) + (period-certification buffer)) + +;; Should be an inline function, but it's broken in ECL +(defmacro recursion (a b c d mask) + `(let ((x ,a) + (y (srli-pi32 ,b +sr1+)) + (z (srli-pi ,c +sr2+)) + (v (slli-pi32 ,d +sl1+)) + (m ,mask)) + (xor-pi (xor-pi (xor-pi z x) v) + (xor-pi (slli-pi x +sl2+) + (and-pi y m))))) + +(defmacro sfmt-aref (buf idx) + `(row-major-aref-api ,buf (the fixnum (* 4 (the fixnum ,idx))))) + +(defun gen-rand-all (buffer) + (declare (optimize (speed 3) #+ecl (safety 0) (debug 0) + #+sbcl (sb-c::insert-array-bounds-checks 0)) + (type uint32-vector buffer)) + #+ecl (check-type buffer uint32-vector) + (assert (= (array-total-size buffer) (* +n+ 4))) + (let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+)) + (r1 (sfmt-aref buffer (- +n+ 2))) + (r2 (sfmt-aref buffer (- +n+ 1)))) + (declare (type int-sse-pack mask r1 r2)) + (macrolet ((twist (delta) + `(psetq r1 r2 + r2 (setf (sfmt-aref buffer i) + (recursion (sfmt-aref buffer i) + (sfmt-aref buffer (+ i (the fixnum ,delta))) + r1 r2 mask))))) + (loop for i fixnum from 0 below (- +n+ +pos1+) + do (twist +pos1+)) + (loop for i fixnum from (- +n+ +pos1+) below +n+ + do (twist (- +pos1+ +n+)))))) + +(defun gen-rand-array (output buffer) + (declare (optimize (speed 3) #+ecl (safety 0) (debug 0) + #+sbcl (sb-c::insert-array-bounds-checks 0)) + (type uint32-vector buffer output)) + #+ecl (check-type buffer uint32-vector) + #+ecl (check-type output uint32-vector) + (assert (= (array-total-size buffer) (* +n+ 4))) + (let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+)) + (size (floor (array-total-size output) 4)) + (r1 (sfmt-aref buffer (- +n+ 2))) + (r2 (sfmt-aref buffer (- +n+ 1)))) + (declare (type int-sse-pack mask r1 r2) + (type fixnum size)) + (assert (> size (* +n+ 2))) + (macrolet ((twist (tgt src1 delta1 src2 delta2) + `(psetq r1 r2 + r2 (setf (sfmt-aref ,tgt i) + (recursion (sfmt-aref ,src1 (- i (the fixnum ,delta1))) + (sfmt-aref ,src2 (+ i (the fixnum ,delta2))) + r1 r2 mask))))) + (loop for i fixnum from 0 below (- +n+ +pos1+) + do (twist output buffer 0 buffer +pos1+)) + (loop for i fixnum from (- +n+ +pos1+) below +n+ + do (twist output buffer 0 output (- +pos1+ +n+))) + (loop for i fixnum from +n+ below (- size +n+) + do (twist output output +n+ output (- +pos1+ +n+))) + #+ () + (loop for j fixnum from 0 below (- (* 2 +n+) size) + do (setf (sfmt-aref buffer j) + (sfmt-aref output (+ j (the fixnum (- size +n+)))))) + (loop + for i fixnum from (- size +n+) below size + for j fixnum from 0 below +n+ ;(max 0 (- (* 2 +n+) size)) + do (twist output output +n+ output (- +pos1+ +n+)) + do (setf (sfmt-aref buffer j) r2)) + output))) + +(defun test () + (let ((out (make-sse-array 10000 :element-type 'uint32))) + (init-gen-rand 1234 *work-buffer*) + (gen-rand-array out *work-buffer*) + (assert (equal (coerce (subseq out 995 1000) 'list) + '(2499610950 3057240914 1662679783 461224431 1168395933))) + (gen-rand-array out *work-buffer*) + (assert (equal (coerce (subseq out 995 1000) 'list) + '(648219337 458306832 3674950976 4030368244 2918117049))))) + +(dotimes (i 10) + (test)) +