[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

some PCL patches



I have made a number of changes to PCL (8/28/88) after examining the
performance of
Chris Burdorf's carwash program in KCL.  The carwash program (with 20 cars)
makes about 10000 method calls, and about 90% of those calls are to slot
accessor
methods.  

The first thing I noticed was that GET/SET-SLOT-VALUE-CACHE-MISS was called
a significant fraction of all slot accesses, so I found an alternative to
the
use of %LOGAND with the cache-mask with better behavior, which eliminated 
cache collisions for the carwash program.  This only improved things
slightly.

The next thing I tried was to eliminate the use of
MAKE-INDIVIDUAL-METHOD-DCODE.
This made the carwash program run 7 times faster!
I think that in KCL (and possibly in some other implementations), it is
never worthwile
to use MAKE-INDIVIDUAL-METHOD-DCODE.  Also, in some implementations, 
MAKE-INDIVIDUAL-METHOD-DCODE might be faster than MAKE-MULTI-METHOD-DCODE, 
but slower than MAKE-ALL-STD-CLASS-READERS-DCODE (which doesn't actually
have
to call any method).

The carwash program (with 20 cars) now runs in about .4 seconds in KCL on a
SUN4,
when compiled with safety 0 (this is the minimum time of several trials); 
without the changes marked with #+xkcl the program runs in .9 seconds; and
before
making any of these changes to PCL it ran in about 7.8 seconds.  With 50
cars,
the minimum run time of 20 trials was .93 seconds, and the average run time
was
1.3 seconds (the difference here is due to gc, which ran 8 times in the 20
trials).

The version of KCL that I use includes the akcl-1-79 patches by Bill
Schelter, 
and some patches I have made myself; the changes in this message will work
in any 
version of KCL, or, for that matter, any CL implementation supported by
PCL.  


Rick Harris


(in-package "PCL")

;; in low.lisp
(defmacro %ash (integer count)
  `(locally (declare (optimize (speed 3) (safety 0)))
     (the fixnum (ash (the fixnum ,integer) ,count))))

(defmacro %mod (x y)
  `(locally (declare (optimize (speed 3) (safety 0)))
     (the fixnum (mod (the fixnum ,x) (the fixnum ,y)))))

(defun primep (x)
  (dotimes (i (1- (isqrt x)) t)
    (when (zerop (mod x (+ i 2)))
      (return nil))))

(defun largest-prime (x)
  (loop (when (primep x)
	  (return x))
	(decf x)))

;; Try to make every bit of the fixnum argument affect the result.
;; This should probably replace %logand everywhere in PCL:
;;   object-cache-no, cache-key-from-wrappers*, lookup-pv-*.
(defmacro fixnum-cache-no (mask fixnum)
  (unless (constantp mask)
    (error "FIXNUM-CACHE-NO requires its first argument (mask) to be a
constant"))
  (let* ((mask-value (eval mask))
	 (lsize (integer-length mask-value))
	 (size (ash 1 lsize))
	 (words-per-entry (1+ (logxor mask-value (1- size))))
	 (lwords-per-entry (integer-length (1- words-per-entry)))
	 (entries (/ size words-per-entry)))
    `(%ash (%mod ,fixnum ,(largest-prime entries)) ,lwords-per-entry)))


;; in kcl-low.lisp
#+kcl
(progn
;; The reason these are here is because the KCL compiler does
;; not allow LET to return FIXNUM values as values of (c) type int, hence
;; the use of LOCALLY (which expands into (LET () (DECLARE ...) ...))
;; forces conversion of ints to objects.
(defmacro %logand (&rest args)
  (reduce-variadic-to-binary 'logand args 0 t 'fixnum))

(defmacro %logxor (&rest args)
  (reduce-variadic-to-binary 'logxor args 0 t 'fixnum))

(defmacro %+ (&rest args)
  (reduce-variadic-to-binary '+ args 0 t 'fixnum))

(defmacro %1+ (x)
  `(the fixnum (1+ (the fixnum ,x))))

(defmacro %svref (vector index)
  `(svref (the simple-vector ,vector) (the fixnum ,index)))

(defsetf %svref (vector index) (new-value)
  `(setf (svref (the simple-vector ,vector) (the fixnum ,index))
         ,new-value))

(defmacro %ash (integer count)
  `(the fixnum (ash (the fixnum ,integer) ,count)))

(defmacro %mod (x y) ; this differs from mod when y is negative
  `(let ((%x ,x) (%y ,y))
     (declare (fixnum %x %y))
     (if (< %x 0)
	 (let ((%r (rem %x %y)))
	   (declare (fixnum %r))
	   (if (zerop %r)
	       %r
	       (the fixnum (- %y %r))))
	 (the fixnum (rem %x %y)))))

(setf (get 'cclosure-env 'compiler::inline-always)
      (list '((t) t nil nil "(#0)->cc.cc_env")))

#+xkcl ; kcl can't use turbo-closures because of a gc bug in mark_object;
xkcl fixes the bug
(progn
(CLines
  "object tc_cc_env_nthcdr (n,tc)"
  "object n,tc;                        "
  "{return (type_of(tc)==t_cclosure&&  "
  "         tc->cc.cc_turbo!=NULL&&    "
  "         type_of(n)==t_fixnum)?     "
  "         tc->cc.cc_turbo[fix(n)]:   " ; assume that n is in bounds
  "         Cnil;                      "
  "}                                   "
  )

(defentry tc-cclosure-env-nthcdr (object object) (object tc_cc_env_nthcdr))

(setf (get 'tc-cclosure-env-nthcdr 'compiler::inline-unsafe)
      '(((fixnum t) t nil nil "(#1)->cc.cc_turbo[#0]")))
)
)


;; in fin.lisp
#+kcl
(progn
(defun allocate-funcallable-instance-1 ()
  (let ((fin (allocate-funcallable-instance-2))
	(env
	  (make-list funcallable-instance-closure-size :initial-element nil)))
    (set-cclosure-env fin env)
    #+xkcl (si:turbo-closure fin)
    (dotimes (i (1- funcallable-instance-closure-size)) (pop env))
    (setf (car env) *funcallable-instance-marker*)
    fin))

;; this replaces funcallable-instance-data-1,
set-funcallable-instance-data-1, and the defsetf
(defmacro funcallable-instance-data-1 (fin data &environment env)
  ;; The stupid compiler won't expand macros before deciding on
optimizations,
  ;; so we must do it here.
  (let* ((pos-form (macroexpand `(funcallable-instance-data-position ,data)
env))
	 (index-form (if (constantp pos-form)
			 (- funcallable-instance-closure-size (eval pos-form) 2)
			 `(- funcallable-instance-closure-size
			     (funcallable-instance-data-position ,data)
			     2))))
    #+xkcl `(car (tc-cclosure-env-nthcdr ,index-form ,fin))
    #-xkcl `(nth ,index-form (cclosure-env ,fin))))
)


;; in slots.lisp
(defmacro slot-value-cache-offset (instance wrapper sxhash)
  `(fixnum-cache-no slot-value-cache-mask
     (%logxor (validate-wrapper ,instance ,wrapper)
              ,sxhash)))

(defmacro get-slot-value-1 (instance wrapper static-slots sxhash slot-name)
  `(let ((offset (slot-value-cache-offset ,instance ,wrapper ,sxhash))
	 (cache *slot-value-cache*)
	 (pos 0))
     (declare (fixnum offset pos))
     (without-interrupts
       (if (and (eq (%svref cache offset) ,wrapper)
		(eq (%svref cache (%+ offset 1)) ,slot-name))
	   (progn
	     (setq pos (%svref cache (%+ offset 2)))
	     (with-interrupts
	       (get-slot-value-2
		 ,instance ,wrapper ,slot-name ,static-slots pos)))
	   (with-interrupts
	     (get-slot-value-cache-miss
	       ,instance ,wrapper ,static-slots ,slot-name offset))))))

(defmacro set-slot-value-1 (nv instance wrapper static-slots sxhash
slot-name)
  `(let ((offset (slot-value-cache-offset ,instance ,wrapper ,sxhash))
	 (cache *slot-value-cache*)
	 (pos 0))
     (declare (fixnum offset pos))
     (without-interrupts
       (if (and (eq (%svref cache offset) ,wrapper)
		(eq (%svref cache (%+ offset 1)) ,slot-name))
	   (progn
	     (setq pos (%svref cache (%+ offset 2)))
	     (with-interrupts
	       (set-slot-value-2
		 ,nv ,instance ,wrapper ,slot-name ,static-slots pos)))
	  (with-interrupts
	    (set-slot-value-cache-miss
	      ,nv ,instance ,wrapper ,static-slots ,slot-name offset))))))

(defun compute-discriminator-code-1 (generic-function)
  (let ((combined (generic-function-combined-methods generic-function))
        (methods (generic-function-methods generic-function))
	(std-class (find-class 'standard-class))
	(t-class *the-class-t*)
	(r/w nil))
    (cond ((null methods)
	   (make-no-methods-dcode generic-function))
	  ((and (null (cdr combined))
		(every #'(lambda (x) (eq x t-class)) (caar combined)))
	   (make-default-method-only-dcode generic-function))
	  #||
	  ((dolist (e combined)
	     (when (dolist (specl (car e))
		     (when (listp specl) (return 't))))
	     (return 't))
	   (make-individual-method-dcode generic-function))
	  ||#
	  ((not
	     (dolist (m methods)
	       (let* ((specls (method-type-specifiers m))
		      (spec0 (car specls))
		      (spec1 (cadr specls)))
		 (cond ((and (memq r/w '(nil r))
			     (standard-reader-method-p m)
			     (not (listp spec0))
			     (if (symbolp spec0)
				 (and (neq spec0 'standard-generic-function)
				      (neq spec0 'generic-function))
				 (eq (class-of spec0) std-class)))
			(setq r/w 'r))
		       ((and (memq r/w '(nil w))
			     (standard-writer-method-p m)
			     (not (listp spec1))
			     (if (symbolp spec1)
				 (and (neq spec1 'standard-generic-function)
				      (neq spec1 'generic-function))
				 (eq (class-of spec1) std-class)))
			(setq r/w 'w))
		       (t
			(return t))))))
	   (if (eq r/w 'r)
	       (make-all-std-class-readers-dcode generic-function) 
	       (make-all-std-class-writers-dcode generic-function)))
          (t
           (make-multi-method-dcode generic-function)))))

;; End of patches to PCL

To fix (any version of) KCL so that it can use the changes marked with
#+xkcl: 
edit the function mark_object in the file kcl/c/gbc.c,
changing the lines:
	case t_cclosure:
		mark_object(x->cc.cc_name);
		mark_object(x->cc.cc_env);
		mark_object(x->cc.cc_data);
		if (x->cc.cc_start == NULL)
			break;
		if (what_to_collect == t_contiguous) {
			if (get_mark_bit((int *)(x->cc.cc_start)))
				break;
			mark_contblock(x->cc.cc_start, x->cc.cc_size);
			if (x->cc.cc_turbo != NULL) {
				for (i = 0, y = x->cc.cc_env;
				     type_of(y) == t_cons;
				     i++, y = y->c.c_cdr);
				mark_contblock((char *)(x->cc.cc_turbo),
					       i*sizeof(object));
			}
		}
		break;
to
	case t_cclosure:
		mark_object(x->cc.cc_name);
		mark_object(x->cc.cc_env);
		mark_object(x->cc.cc_data);
		if (what_to_collect == t_contiguous)
		        if (x->cc.cc_turbo != NULL) {
				for (i = 0, y = x->cc.cc_env;
				     type_of(y) == t_cons;
				     i++, y = y->c.c_cdr);
				mark_contblock((char *)(x->cc.cc_turbo),
					       i*sizeof(object));
			}
		if (x->cc.cc_start == NULL)
			break;
		if (what_to_collect == t_contiguous) {
			if (get_mark_bit((int *)(x->cc.cc_start)))
				break;
			mark_contblock(x->cc.cc_start, x->cc.cc_size);
		}
		break;