;; I keep finding that I need a function which will partition a sequence into runs of things. ;; For instance, you might want '(1 1 1 2 3 3 4 4 4 3 3 3 5 5) ;; to go to: '((1 1 1) (2) (3 3) (4 4 4) (3 3 3) (5 5)) ;; Which is what partition-by does: (partition-by identity '(1 1 1 2 3 3 4 4 4 3 3 3 5 5)) ;-> ((1 1 1) (2) (3 3) (4 4 4) (3 3 3) (5 5)) ;; But partition-by isn't quite what I want. ;; I'd like to be able to turn '( 1 2 3 4 1 2 3 2 3 4 6 9 10) ;; Into '((1 2 3 4) (1 2 3) (2 3 4) (6) (9 10)) ;; By defining a comparator like: (defn ^:dynamic sameish [a b] ( = (inc a) b)) ;; And then saying: ;; (partition-by-equivalence sameish '( 1 2 3 4 1 2 3 2 3 4 6 9 10)) ;; Doing this by hand: ;; () () '( 1 2 3 4 1 2 3 2 3 4 6 9 10) ;; -> ;; () (1) (2 3 4 1 2 3 2 3 4 6 9 10) ;; -> ;; () (1 2) (3 4 1 2 3 2 3 4 6 9 10) ;; -> ;; () (1 2 3) (4 1 2 3 2 3 4 6 9 10) ;; -> ;; () (1 2 3 4) (1 2 3 2 3 4 6 9 10) ;; -> test is false, start a new list ;; ((1 2 3 4)) (1) (2 3 2 3 4 6 9 10) ;; -> ;; ((1 2 3 4)) (1 2) (3 2 3 4 6 9 10) ;; -> ;; ((1 2 3 4)) (1 2 3) (2 3 4 6 9 10) ;; -> test is false, start a new list ;; ((1 2 3 4) (1 2 3)) (2 3 4 6 9 10) ;; -> ;; ((1 2 3 4) (1 2 3) (2)) (3 4 6 9 10) ;; -> etc ;; makes me think that this looks like a tail recursion with two accumulators (defn ^:dynamic recaccacc [ f acc1 acc2 coll] (if (empty? coll) (cons acc2 acc1) (if (empty? acc2) (recaccacc f acc1 (cons (first coll) acc2) (rest coll)) (if (f (first acc2) (first coll)) (recaccacc f acc1 (cons (first coll) acc2) (rest coll)) (recaccacc f (cons acc2 acc1) '() coll))))) ;; Unfortunately, this comes out backwards (use 'clojure.tools.trace) (dotrace [recaccacc] (recaccacc sameish '() '() '(1 2 3 4 1 2 3 2 3 4 6 9 10))) ;; TRACE t1169: (recaccacc #<user$sameish user$sameish@11b99c4> () () (1 2 3 4 1 2 3 2 3 4 6 9 10)) ;; TRACE t1170: | (recaccacc #<user$sameish user$sameish@11b99c4> () (1) (2 3 4 1 2 3 2 3 4 6 9 10)) ;; TRACE t1171: | | (recaccacc #<user$sameish user$sameish@11b99c4> () (2 1) (3 4 1 2 3 2 3 4 6 9 10)) ;; TRACE t1172: | | | (recaccacc #<user$sameish user$sameish@11b99c4> () (3 2 1) (4 1 2 3 2 3 4 6 9 10)) ;; TRACE t1173: | | | | (recaccacc #<user$sameish user$sameish@11b99c4> () (4 3 2 1) (1 2 3 2 3 4 6 9 10)) ;; TRACE t1174: | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((4 3 2 1)) () (1 2 3 2 3 4 6 9 10)) ;; TRACE t1175: | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((4 3 2 1)) (1) (2 3 2 3 4 6 9 10)) ;; TRACE t1176: | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((4 3 2 1)) (2 1) (3 2 3 4 6 9 10)) ;; TRACE t1177: | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((4 3 2 1)) (3 2 1) (2 3 4 6 9 10)) ;; TRACE t1178: | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((3 2 1) (4 3 2 1)) () (2 3 4 6 9 10)) ;; TRACE t1179: | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((3 2 1) (4 3 2 1)) (2) (3 4 6 9 10)) ;; TRACE t1180: | | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((3 2 1) (4 3 2 1)) (3 2) (4 6 9 10)) ;; TRACE t1181: | | | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((3 2 1) (4 3 2 1)) (4 3 2) (6 9 10)) ;; TRACE t1182: | | | | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((4 3 2) (3 2 1) (4 3 2 1)) () (6 9 10)) ;; TRACE t1183: | | | | | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((4 3 2) (3 2 1) (4 3 2 1)) (6) (9 10)) ;; TRACE t1184: | | | | | | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((6) (4 3 2) (3 2 1) (4 3 2 1)) () (9 10)) ;; TRACE t1185: | | | | | | | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((6) (4 3 2) (3 2 1) (4 3 2 1)) (9) (10)) ;; TRACE t1186: | | | | | | | | | | | | | | | | | (recaccacc #<user$sameish user$sameish@11b99c4> ((6) (4 3 2) (3 2 1) (4 3 2 1)) (10 9) ()) ;; TRACE t1186: | | | | | | | | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1185: | | | | | | | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1184: | | | | | | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1183: | | | | | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1182: | | | | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1181: | | | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1180: | | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1179: | | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1178: | | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1177: | | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1176: | | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1175: | | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1174: | | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1173: | | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1172: | | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1171: | | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1170: | => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; TRACE t1169: => ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;-> ((10 9) (6) (4 3 2) (3 2 1) (4 3 2 1)) ;; Which we can fix: (reverse (map reverse (recaccacc sameish '() '() '(1 2 3 4 1 2 3 2 3 4 6 9 10)))) ;-> ((1 2 3 4) (1 2 3) (2 3 4) (6) (9 10)) ;; Hooray! ;; So our first definition is (defn partition-by-equivalence [f coll] (let [recaccacc (fn [f acc1 acc2 coll] (if (empty? coll) (reverse (cons (reverse acc2) acc1)) (if (empty? acc2) (recur f acc1 (cons (first coll) acc2) (rest coll)) (if (f (first acc2) (first coll)) (recur f acc1 (cons (first coll) acc2) (rest coll)) (recur f (cons (reverse acc2) acc1) '() coll)))))] (recaccacc f '() '() coll))) (partition-by-equivalence sameish '(1 2 3 4 1 2 3 2 3 4 6 9 10)) ;-> ((1 2 3 4) (1 2 3) (2 3 4) (6) (9 10)) (partition-by-equivalence sameish '()) ;-> (()) (partition-by-equivalence sameish '(1)) ;-> ((1)) (partition-by-equivalence sameish '(1 1)) ;-> ((1) (1)) (partition-by-equivalence sameish '(1 2)) ;-> ((1 2)) (partition-by-equivalence sameish '(1 2 1)) ;-> ((1 2) (1)) (partition-by-equivalence sameish '(1 2 1 1)) ;-> ((1 2) (1) (1)) (partition-by-equivalence sameish '(1 2 1 1 2 2)) ;-> ((1 2) (1) (1 2) (2)) ;; Here's some incomprehensible maths-stuff about numbers of digits and logarithms and so on. (map count (partition-by (fn[a] (int (Math/log a))) (range 1 10000))) ; (2 5 13 34 94 255 693 1884 5123 1896) (partition-by-equivalence (fn [a b] (= (int (Math/log a)) (int (Math/log b)))) (range 1 100)) ;-> ((1 2) (3 4 5 6 7) (8 9 10 11 12 13 14 15 16 17 18 19 20) (21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54) (55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99)) (partition-by-equivalence (fn [a b] (= (int (Math/log10 a)) (int (Math/log10 b)))) (range 1 100)) ;-> ((1 2 3 4 5 6 7 8 9) (10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99)) ;; ascending subsequences (partition-by-equivalence <= '(1 2 3 3 4 5 7 8 9 1 2 5 6 1 7 8)) ;-> ((1 2 3 3 4 5 7 8 9) (1 2 5 6) (1 7 8)) ;; strictly ascending subsequences (partition-by-equivalence < '(1 2 3 3 4 5 7 8 9 1 2 5 6 1 7 8)) ;-> ((1 2 3) (3 4 5 7 8 9) (1 2 5 6) (1 7 8)) ;; lengths of increasing runs (map count (partition-by-equivalence <= '(1 2 3 3 4 5 7 8 9 1 2 5 6 1 7 8)) ) ;-> (9 4 3) ;; lengths of decreasing ones (map count (partition-by-equivalence >= '(1 2 3 3 4 5 7 8 9 1 2 5 6 1 7 8)) ) ;-> (1 1 2 1 1 1 1 2 1 1 2 1 1) ;; and finally, a simplified version of the latest problem I actually needed this for, pulling a sequence of lists of scores out of a log file ;; so that each full score list only appears once, and all its ancestors are discarded. (map last (partition-by-equivalence (fn[a b] (= a (drop 1 b))) '( () (1) (2 1) (3 2 1) () (9) (7 9)))) ;-> ((3 2 1) (7 9)) ;; It's a strict generalization of partition-by (defn my-partition-by [f coll] (partition-by-equivalence (fn[a b] (= (f a) (f b))) coll)) (map #(/ (Math/log %) (Math/log 2)) (range 1 100)) (my-partition-by #(int(/ (Math/log %) (Math/log 2))) (range 1 100)) ;; And I think it's a really nice function, which is helpful in all sort of situations. ;; It should be possible to make it completely lazy, so that it can take infinite inputs without wolfing the lot.

## Wednesday, March 27, 2013

### partition-by-equivalence

### Rerum Cognoscere Causas IV : How to Deal with Larger Samples

;; Rerum Cognoscere Causas IV : How to Deal with Larger Samples. ;; So far we have: ;; Our data, generated at random from a secret algorithm: (def ^:dynamic *randomizer* (java.util.Random. 0)) (defn rand-int [n] (.nextInt *randomizer* n)) (defn D6 [] (inc (rand-int 6))) (defn three-D6 [] (reduce + (repeatedly 3 D6))) (defn three-from-four-D6 [] (reduce + (drop 1 (sort (repeatedly 4 D6))))) (defn mixed [] (if (zero? (rand-int 10)) (three-from-four-D6) (three-D6))) (defn first-edition [] {:str (three-D6) :int (three-D6)}) (defn second-edition [] {:str (mixed) :int (mixed)}) (defn third-edition [] (if (zero? (rand-int 10)) {:str (three-from-four-D6) :int (three-from-four-D6)} {:str (three-D6) :int (three-D6)})) (def village (binding [*randomizer* (java.util.Random. 0)] (doall (repeatedly 100 (case (rand-int 3) 0 first-edition 1 second-edition 2 third-edition))))) ;; And the calculations of our sages, who have determined with prodigious effort: ;; the sides on a six sided die (def r16 (range 1 7)) ;; the probabilities of each result for each suggested method of generating a characteristic (def threed6f (frequencies (for [i r16 j r16 k r16] (reduce + [i j k])))) (def fourd6drop1f (frequencies (for [i r16 j r16 k r16 l r16] (reduce + (drop 1 (sort [i j k l])))))) (defn p3d6 [char] (/ (threed6f char) (reduce + (vals threed6f)))) (defn p4d6drop1 [char] (/ (fourd6drop1f char) (reduce + (vals fourd6drop1f)))) (defn pmixed [char] (+ (* 9/10 (p3d6 char)) (* 1/10 (p4d6drop1 char)))) ;; And thus the probabilities of a villager with particular characteristics coming into being under their scheme (def ptrad (memoize (fn [{:keys [str int]}] (* (p3d6 int) (p3d6 str))))) (def pindep(memoize (fn [{:keys [str int]}] (* (pmixed int) (pmixed str))))) (def pcommon (memoize (fn [{:keys [str int]}] (+ (* 9/10 (p3d6 int) (p3d6 str)) (* 1/10 (p4d6drop1 int) (p4d6drop1 str)))))) ;; using these calculations, we can take beliefs (def prior [1 1 1]) ;; and update them when we find new evidence. (defn update [beliefs villager] (map * ((juxt ptrad pindep pcommon) villager) beliefs)) ;; since ratios are a bit unreadable, this function will allow us to render them as (approximate) percentages. (defn approx-odds [[a b c]] (let [m (/ (+ a b c) 100)] (mapv int [(/ a m) (/ b m) (/ c m)]))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Earlier we found that there's not really enough information in our village to convince anyone one way or the other. (approx-odds (reduce update prior village)) ;-> [15 31 52] ;; It points half-heartedly towards a common cause model, but we wouldn't be surprised if that was the wrong answer. ;; As it happens, our village is just one of ten in the district (def district (binding [*randomizer* (java.util.Random. 0)] (doall (repeatedly 1000 (case (rand-int 3) 0 first-edition 1 second-edition 2 third-edition))))) (def villages (partition 100 district)) ;; paranoid check (= village (first villages)) ;-> true ;; So let's see what conclusions we can draw from each individual village (for [v villages] (approx-odds (reduce update prior v))) ;-> ([15 31 52] [53 29 17] [29 38 32] [63 21 14] [34 37 27] [36 27 36] [39 38 22] [49 30 19] [48 24 27] [20 37 42]) ;; Some villages are pointing one way, and some the other. ;; We might want to consider the district as a whole: ;; (approx-odds (reduce update prior district)) ;; But unfortunately this expression takes a while to evaluate. Can you see why? ;; Here's a clue: (reduce update [1.0 1.0 1.0] (take 100 district)) ;-> (1.0019121633199549E-224 2.0261656590064221E-224 3.398128024413593E-224) ;; And rather more worryingly (reduce update [1.0 1.0 1.0] district) ;-> (0.0 0.0 0.0) ;; It looks as though mathematics itself is failing us! ;; We need to find a new way to multiply numbers. ;; Fortunately we can add their logarithms instead: (+ (Math/log 6) (Math/log 6)) ;-> 3.58351893845611 (Math/log (* 6 6)) ;-> 3.58351893845611 ;; So let's make a logarithmic version of our update function (defn log-update [beliefs villager] (map + (map #(Math/log %) ((juxt ptrad pindep pcommon) villager)) beliefs)) ;; And of our prior: (def log-prior (map #(Math/log %) prior)) ;; And try that on the village (reduce log-update log-prior village) ;; (-515.7771504932035 -515.0729156616442 -514.5558361317242) ;; If we know the logs, we can get the numbers we actually want: (map #(Math/exp %) (reduce log-update log-prior village)) ;-> (1.0019121633198053E-224 2.0261656590065323E-224 3.398128024414225E-224) ;; Notice that these numbers are not exactly the same as the numbers calculated above. ;; Exercise for the reader: which of the two answers we can calculate is closer to the answer we would like to calculate but can't? ;; We can then turn those numbers into percentages (approx-odds (map #(Math/exp %) (reduce log-update log-prior village))) ;-> [15 31 52] ;; So we can do our calculation on the district as a whole: (reduce log-update log-prior district) ;;-> (-4967.368738149676 -4968.862029975447 -4970.195021140233) ;; Hooray, now we have the logarithms of the numbers we really want! ;; That's done our probability calculation for us, but unfortunately we can't actually recover ;; the number that -4967.36... is the logarithm of: (Math/exp -4967) ;; Because it's too small to fit into floating point arithmetic. ;; So we need to pull another rabbit out of our hat: ;; All we're interested in is the ratios of the three likelihoods. ;; That's not affected by multiplying through by a constant (approx-odds [2 1 1]) ; [50 25 25] (approx-odds [6 3 3]) ; [50 25 25] (approx-odds [1332 666 666]) ; [50 25 25] ;; Similarly, if the likelihoods are expressed as logarithms, the odds ratio ;; isn't affected by adding a constant. (approx-odds (map #(Math/exp %) [2 1 1])) ; [57 21 21] (approx-odds (map #(Math/exp %) [3 2 2])) ; [57 21 21] (approx-odds (map #(Math/exp %) [4 3 3])) ; [57 21 21] ;; What we'd like to be able to calculate is what this would be: ;; (approx-odds (map #(Math/exp %) '(-4967.368738149676 -4968.862029975447 -4970.195021140233))) ;; But we can't, since the Math/exp function is broken on numbers this low. ;; So we'll calculate instead (adding 4967 to each log-likelihood): (approx-odds (map #(Math/exp %) '(-0.368738149676 -1.862029975447 -3.195021140233))) ;-> [77 17 4] ;; So after looking at a full thousand people, with perfect models and ;; a perfect prior, looking for what you would have thought was a ;; pretty obvious effect, the existence of gifted superbeings, we're ;; still in a bit of a dubious position. ;; If we decide 'that's good enough', and declare that we live in a ;; first e'dition world, then we've still got a fair chance of being ;; wrong. ;; We can generalize our procedure thus: (defn log-update [beliefs villager] (doall (map + (map #(Math/log %) ((juxt ptrad pindep pcommon) villager)) beliefs))) ; the doall is papering over a bug in clojure's lazy sequences (defn log-posterior [log-prior data] (reduce log-update log-prior data)) (defn log-prior [& s] (map #(Math/log %) s)) (defn percentages-from-log-beliefs [beliefs] (let [bmax (apply max beliefs) odds (for [b beliefs] (Math/exp (- b bmax)))] (approx-odds odds))) ;; paranoid checking again: (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) village)) ;-> [15 31 52] (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) district)) ;-> [77 17 4] ;; And look at an even larger sample: (def country (binding [*randomizer* (java.util.Random. 0)] (doall (repeatedly 10000 (case (rand-int 3) 0 first-edition 1 second-edition 2 third-edition))))) (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) (take 100 country))) ; [15 31 52] (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) (take 300 country))) ; [27 39 32] (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) (take 1000 country))) ; [77 17 4] (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) (take 1500 country))) ; [96 3 0] (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) (take 3000 country))) ; [99 0 0] ;; Finally, this looks decisive! ;; And everyone agrees. (Their prior beliefs are *overwhelmed* by the evidence) (percentages-from-log-beliefs (log-posterior (log-prior 10 1 1) (take 3000 country))) ; [99 0 0] (percentages-from-log-beliefs (log-posterior (log-prior 1 10 1) (take 3000 country))) ; [99 0 0] (percentages-from-log-beliefs (log-posterior (log-prior 1 1 10) (take 3000 country))) ; [99 0 0] ;; If we're completely paranoid: (percentages-from-log-beliefs (log-posterior (log-prior 1 1 1) (take 10000 country))) ; [100 0 0] ;; That's close enough for government work. ;; So, in summary, we're looking at a situation where we understand perfectly what's going on, but can't directly see which of the three alternatives ;; (a) everyone's on a bell curve ;; (b) it's not quite a bell curve, it's a bit biased towards good scores ;; (c) there is a sub-race of gifted superbeings ;; was chosen by our Dungeon Master. We just have to look at characteristic scores. ;; And all our measurements are perfect, there is no noise in the data at all, and no systematic bias in our sampling. ;; And we are using, as far as I know, platonically ideal statistical ;; methods perfectly suited to the problem in hand, and guaranteed to ;; extract absolutely all the significance from our data that there ;; is. ;; And still, we need one thousand data points to get a feel for which way the wind is blowing (one hundred was actively misleading!) ;; And rather more than that to make people change their minds. ;; I would be most interested to know if anyone thinks I've done this analysis wrongly. ;; Things I am not quite certain of are the randomness of the random number generator and the behaviour of the floating point numbers. ;; I'm reasonably confident of the basic method of comparing the three models, and so that means I'll be even more interested and grateful ;; if someone can show me that I'm talking rubbish. ;; If it's true, it makes me wonder how it is possible to know anything at all about anything interesting. ;; Next time I see a study claiming to show something, I might see if I can make this sort of analysis work on it.

### Rerum Cognoscere Causas III : What can we tell from our small sample?

;; Rerum Cognoscere Causas III : What can we tell from our small sample? ;; So far we have: ;; Our data, generated at random from a secret algorithm: (def ^:dynamic *randomizer* (java.util.Random. 0)) (defn rand-int [n] (.nextInt *randomizer* n)) (defn D6 [] (inc (rand-int 6))) (defn three-D6 [] (reduce + (repeatedly 3 D6))) (defn three-from-four-D6 [] (reduce + (drop 1 (sort (repeatedly 4 D6))))) (defn mixed [] (if (zero? (rand-int 10)) (three-from-four-D6) (three-D6))) (defn first-edition [] {:str (three-D6) :int (three-D6)}) (defn second-edition [] {:str (mixed) :int (mixed)}) (defn third-edition [] (if (zero? (rand-int 10)) {:str (three-from-four-D6) :int (three-from-four-D6)} {:str (three-D6) :int (three-D6)})) (def village (binding [*randomizer* (java.util.Random. 0)] (doall (repeatedly 100 (case (rand-int 3) 0 first-edition 1 second-edition 2 third-edition))))) village ;-> ({:str 13, :int 18} {:str 11, :int 18} {:str 14, :int 15} {:str 6, :int 12} {:str 14, :int 13} {:str 18, :int 10} {:str 15, :int 11} {:str 12, :int 15} {:str 7, :int 8} {:str 16, :int 12} {:str 8, :int 7} {:str 9, :int 14} {:str 10, :int 9} {:str 11, :int 10} {:str 5, :int 10} {:str 7, :int 9} {:str 9, :int 13} {:str 12, :int 9} {:str 13, :int 9} {:str 5, :int 9} {:str 8, :int 13} {:str 9, :int 11} {:str 13, :int 14} {:str 12, :int 14} {:str 12, :int 17} {:str 14, :int 9} {:str 10, :int 11} {:str 18, :int 17} {:str 11, :int 9} {:str 8, :int 9} {:str 15, :int 13} {:str 8, :int 5} {:str 11, :int 9} {:str 10, :int 8} {:str 9, :int 12} {:str 5, :int 11} {:str 10, :int 7} {:str 9, :int 14} {:str 11, :int 9} {:str 11, :int 12} {:str 12, :int 13} {:str 15, :int 9} {:str 12, :int 12} {:str 6, :int 13} {:str 5, :int 4} {:str 12, :int 13} {:str 15, :int 10} {:str 14, :int 14} {:str 11, :int 4} {:str 12, :int 9} {:str 10, :int 12} {:str 7, :int 12} {:str 8, :int 11} {:str 10, :int 10} {:str 9, :int 8} {:str 8, :int 12} {:str 7, :int 9} {:str 13, :int 3} {:str 14, :int 9} {:str 8, :int 9} {:str 10, :int 11} {:str 15, :int 4} {:str 10, :int 11} {:str 8, :int 10} {:str 15, :int 10} {:str 8, :int 13} {:str 12, :int 5} {:str 8, :int 16} {:str 4, :int 8} {:str 10, :int 18} {:str 12, :int 12} {:str 11, :int 10} {:str 12, :int 8} {:str 12, :int 13} {:str 8, :int 12} {:str 9, :int 12} {:str 12, :int 10} {:str 15, :int 10} {:str 8, :int 11} {:str 7, :int 11} {:str 4, :int 8} {:str 12, :int 11} {:str 13, :int 9} {:str 14, :int 13} {:str 5, :int 9} {:str 17, :int 10} {:str 8, :int 13} {:str 9, :int 10} {:str 5, :int 14} {:str 15, :int 12} {:str 13, :int 13} {:str 11, :int 8} {:str 8, :int 6} {:str 12, :int 8} {:str 10, :int 3} {:str 14, :int 9} {:str 15, :int 12} {:str 15, :int 14} {:str 6, :int 10} {:str 16, :int 13}) ;; And the calculations of our sages, who have determined with prodigious effort: ;; the sides on a six sided die (def r16 (range 1 7)) ;; the probabilities of each result for each suggested method of generating a characteristic (def threed6f (frequencies (for [i r16 j r16 k r16] (reduce + [i j k])))) (def fourd6drop1f (frequencies (for [i r16 j r16 k r16 l r16] (reduce + (drop 1 (sort [i j k l])))))) (defn p3d6 [char] (/ (threed6f char) (reduce + (vals threed6f)))) (defn p4d6drop1 [char] (/ (fourd6drop1f char) (reduce + (vals fourd6drop1f)))) (defn pmixed [char] (+ (* 9/10 (p3d6 char)) (* 1/10 (p4d6drop1 char)))) ;; And thus the probabilities of a villager with particular characteristics coming into being under their scheme (def ptrad (memoize (fn [{:keys [str int]}] (* (p3d6 int) (p3d6 str))))) (def pindep(memoize (fn [{:keys [str int]}] (* (pmixed int) (pmixed str))))) (def pcommon (memoize (fn [{:keys [str int]}] (+ (* 9/10 (p3d6 int) (p3d6 str)) (* 1/10 (p4d6drop1 int) (p4d6drop1 str)))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Very strong, very clever villagers are more likely under the common ;; cause model than under the independent mixed model and even less ;; likely under the traditional model: (apply < ((juxt ptrad pindep pcommon) {:str 18 :int 18})) ;-> true ;; Let us imagine a neutral observer, employed to fairly determine the correctness of the three schools. ;; He starts off looking at each school's argument, and can see no way ;; to decide between the two, so he assigns them odds of 1:1:1 , ;; meaning that he thinks each one is equally likely, or equivalently, ;; that he will take or place bets on any of the three schools at odds ;; of 2:1 against ( 1 gold piece gets you 2 if you picked right ), ;; (plus a small commission for bookkeeping and accepting risk.) (def prior [1 1 1]) ;; He now considers the first villager: (def Olaf (first village)) ;; Who is strong, and extremely clever: Olaf ;-> {:str 13, :int 18} ;; And he considers the probability that Olaf would exist under each of the models. (ptrad Olaf) ;-> 7/15552 (pindep Olaf) ;-> 653/1119744 (pcommon Olaf) ;-> 217/349920 ;; Having observed some data, he now considers that he should re-weight his beliefs in the three models accordingly: (map * ((juxt ptrad pindep pcommon) Olaf) prior) ;-> (7/15552 653/1119744 217/349920) ;; A ratio of odds can always be rescaled. 10:5 is the same as 2:1. (map #(float (* 15552/7 %)) (map * ((juxt ptrad pindep pcommon) Olaf) prior)) ;-> (1.0 1.2956349 1.3777778) ;; So here's a function to take any odds ratio and turn it into (approximate) percentages (defn approx-odds [[a b c]] (let [m (/ (+ a b c) 100)] (mapv int [(/ a m) (/ b m) (/ c m)]))) (approx-odds (map * ((juxt ptrad pindep pcommon) Olaf) prior)) ;-> [27 35 37] ;; And we can now see that the arbitrator's judgement has shifted a little away from the traditionalists, and towards the second and third e'ditions. ;; This is reasonable, given that he has just observed a man who is both cleverer and stronger than would be expected by the traditionalists ;; Suppose he had seen Magnus instead: (def Magnus (second village)) Magnus ;-> {:str 11, :int 18} ;; Magnus is average in strength, but very clever. This is probably more likely under the independent rules than it is under the common cause rules. (approx-odds (map * ((juxt ptrad pindep pcommon) Magnus) prior)) ;-> [28 35 35] ;; Seeing Magnus should change a neutral person's beliefs towards the second e'dition, mostly at the expense of the first. ;; But of course, our observer has seen both: (approx-odds (map * ((juxt ptrad pindep pcommon) Magnus) (map * ((juxt ptrad pindep pcommon) Olaf) prior))) ;-> [23 37 39] ;; Implying that the combined effect of both men is to discredit the traditional school while slightly favouring the common-cause hypothesis. ;; We could generate the series representing how the assessors beliefs should change as he ;; considers each villager like this: (reductions (fn [beliefs villager] (map * ((juxt ptrad pindep pcommon) villager) beliefs)) prior (list Magnus Olaf)) ;-> ([1 1 1] (1/1728 803/1119744 247/349920) (7/26873856 524359/1253826625536 53599/122444006400)) ;; More readably, we can separate the function which updates our beliefs given a datum. (defn update [beliefs villager] (map * ((juxt ptrad pindep pcommon) villager) beliefs)) (map approx-odds (reductions update prior (list Magnus Olaf))) ;-> ([33 33 33] [28 35 35] [23 37 39]) ;; What if we look at the first ten villagers? (map approx-odds (reductions update prior (take 10 village))) ;-> ([33 33 33] [27 35 37] [23 37 39] [19 37 42] [20 37 41] [18 38 43] [16 40 43] [14 41 44] [13 41 45] [14 40 45] [12 40 46]) ;; The first twenty: (approx-odds (reduce update prior (take 20 village))) ;-> [19 36 44] ;; And at the whole village? (approx-odds (reduce update prior (take 100 village))) ;;-> [15 31 52] ;; So if we look at our whole village, it looks as though we'd be ;; slightly more confident that we lived in a third e'dition world. ;; But we're really not terribly confident about that. ;; We know that the models and priors are spot on, since ;; we've seen the source code for the world. ;; But even then, if we declared on the basis of this one village that ;; we lived in a third edition world, we'd literally expect to be ;; wrong half the time. ;; That's only a slight improvement on being wrong two thirds of the ;; time, which we'd expect if we hadn't bothered to look at any data at ;; all. Most of our opinion is coming from our prior beliefs and there ;; is really very little evidence in the data that we have. ;; Another way to look at this is to ask what happens to the beliefs of the three competing philosophical schools. ;; Suppose each of the three is initially so convinced of its own ;; correctness that it will place or lay bets at 5:1 on its pet ;; hypothesis. But that each school will adjust its beliefs as it sees evidence. ;; So the first e'dition guys will suffer a serious loss of confidence (approx-odds (reduce update [10 1 1] (take 100 village))) ; [64 13 22] ;; But not actually change their minds, just the strength of their convictions. ;; The second e'dition guys barely notice. (approx-odds (reduce update [1 10 1] (take 100 village))) ; [4 82 13] ;; While the third e'dition find their beliefs slightly strengthened ;; after looking at the evidence. (approx-odds (reduce update [1 1 10] (take 100 village))) ; [2 5 91] ;; Clearly more research is needed!

## Tuesday, March 5, 2013

### Rerum Cognoscere Causas II : In Which Divers Probabilities are Deduced Simply by Counting

;; Rerum Cognoscere Causas II : In Which Divers Probabilities are Deduced Simply by Counting ;; The problem: Which e'dition's rules were used to generate a village? (def ^:dynamic *randomizer* (java.util.Random. 0)) (defn rand-int [n] (.nextInt *randomizer* n)) (defn D6 [] (inc (rand-int 6))) (defn three-D6 [] (reduce + (for [i (range 3)] (D6)))) (defn three-from-four-D6 [] (reduce + (drop 1 (sort (for [i (range 4)] (D6)))))) (defn mixed [] (if (zero? (rand-int 10)) (three-from-four-D6) (three-D6))) (defn first-edition [] {:str (three-D6) :int (three-D6)}) (defn second-edition [] {:str (mixed) :int (mixed)}) (defn third-edition [] (if (zero? (rand-int 10)) {:str (three-from-four-D6) :int (three-from-four-D6)} {:str (three-D6) :int (three-D6)})) (def village (binding [*randomizer* (java.util.Random. 0)] (doall (repeatedly 100 (case (rand-int 3) 0 first-edition 1 second-edition 2 third-edition))))) village ;-> ({:str 13, :int 18} {:str 11, :int 18} {:str 14, :int 15} {:str 6, :int 12} {:str 14, :int 13} {:str 18, :int 10} {:str 15, :int 11} {:str 12, :int 15} {:str 7, :int 8} {:str 16, :int 12} {:str 8, :int 7} {:str 9, :int 14} {:str 10, :int 9} {:str 11, :int 10} {:str 5, :int 10} {:str 7, :int 9} {:str 9, :int 13} {:str 12, :int 9} {:str 13, :int 9} {:str 5, :int 9} {:str 8, :int 13} {:str 9, :int 11} {:str 13, :int 14} {:str 12, :int 14} {:str 12, :int 17} {:str 14, :int 9} {:str 10, :int 11} {:str 18, :int 17} {:str 11, :int 9} {:str 8, :int 9} {:str 15, :int 13} {:str 8, :int 5} {:str 11, :int 9} {:str 10, :int 8} {:str 9, :int 12} {:str 5, :int 11} {:str 10, :int 7} {:str 9, :int 14} {:str 11, :int 9} {:str 11, :int 12} {:str 12, :int 13} {:str 15, :int 9} {:str 12, :int 12} {:str 6, :int 13} {:str 5, :int 4} {:str 12, :int 13} {:str 15, :int 10} {:str 14, :int 14} {:str 11, :int 4} {:str 12, :int 9} {:str 10, :int 12} {:str 7, :int 12} {:str 8, :int 11} {:str 10, :int 10} {:str 9, :int 8} {:str 8, :int 12} {:str 7, :int 9} {:str 13, :int 3} {:str 14, :int 9} {:str 8, :int 9} {:str 10, :int 11} {:str 15, :int 4} {:str 10, :int 11} {:str 8, :int 10} {:str 15, :int 10} {:str 8, :int 13} {:str 12, :int 5} {:str 8, :int 16} {:str 4, :int 8} {:str 10, :int 18} {:str 12, :int 12} {:str 11, :int 10} {:str 12, :int 8} {:str 12, :int 13} {:str 8, :int 12} {:str 9, :int 12} {:str 12, :int 10} {:str 15, :int 10} {:str 8, :int 11} {:str 7, :int 11} {:str 4, :int 8} {:str 12, :int 11} {:str 13, :int 9} {:str 14, :int 13} {:str 5, :int 9} {:str 17, :int 10} {:str 8, :int 13} {:str 9, :int 10} {:str 5, :int 14} {:str 15, :int 12} {:str 13, :int 13} {:str 11, :int 8} {:str 8, :int 6} {:str 12, :int 8} {:str 10, :int 3} {:str 14, :int 9} {:str 15, :int 12} {:str 15, :int 14} {:str 6, :int 10} {:str 16, :int 13}) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; The first thing that our philosophers should look at is the distribution of scores according to their models. ;; What do the results of rolling 3D6 and adding look like? We'll just ;; enumerate the 216 possible ways the dice can fall and count how ;; many ways there are to score the various possible numbers: (def threed6f (frequencies (for [i (range 1 7) j (range 1 7) k (range 1 7)] (reduce + [i j k])))) ;; {3 1, 4 3, 5 6, 6 10, 7 15, 8 21, 9 25, 10 27, 11 27, 12 25, 13 21, 14 15, 15 10, 16 6, 17 3, 18 1} ;; What about 4D6 and discarding the lowest? (def fourd6drop1f (frequencies (for [i (range 1 7) j (range 1 7) k (range 1 7) l (range 1 7)] (reduce + (drop 1 (sort [i j k l])))))) ;; {3 1, 4 4, 5 10, 6 21, 7 38, 8 62, 9 91, 10 122, 11 148, 12 167, 13 172, 14 160, 15 131, 16 94, 17 54, 18 21} ;; So the probability distributions for the dice rolling methods are: (defn p3d6 [char] (/ (threed6f char) (reduce + (vals threed6f)))) (defn p4d6drop1 [char] (/ (fourd6drop1f char) (reduce + (vals fourd6drop1f)))) (defn pmixed [char] (+ (* 9/10 (p3d6 char)) (* 1/10 (p4d6drop1 char)))) ;; Sanity checks! ;; As every schoolboy once knew: ;; You get 3 sixes only one time in ~200, so 18s are very rare (p3d6 18) ;-> 1/216 (p3d6 17) ;-> 1/72 ;; In fact only one roll in ten is in the 15-18 range: (reduce + (map p3d6 '(18 17 16 15))) ;; But you see 10 or 11 very often, in fact about a quarter of the time. (p3d6 10) ;-> 1/8 (p3d6 11) ;-> 1/8 ;; However if you get to roll four dice and ignore one of them, ;; then the high scores are more likely. (p4d6drop1 18) ;-> 7/432 (p4d6drop1 17) ;-> 1/24 (reduce + (map p4d6drop1 '(18 17 16 15))) ;-> 25/108 ;; And in fact you have a quarter chance of an exceptional score 15-18. ;; 10 and 11 are slightly less likely than they were, and 11 is more likely than 10 (p4d6drop1 11) ;-> 37/324 (p4d6drop1 10) ;-> 61/648 ;; Truly bad scores are very hard to get (p4d6drop1 3) ;-> 1/1296 ;; In fact only one in six scores are 'below average'. (reduce + (map p4d6drop1 '(3 4 5 6 7 8 9))) ;-> 227/1296 ;; However in the mixed distribution where you might be doing it one ;; way and you might be doing it the other, these differences are not ;; nearly as large: (pmixed 18) ;-> 5/864 (pmixed 3) ;-> 11/2592 ; A subtle bias indeed. (/ (pmixed 18) (pmixed 3)) ;-> 15/11 ;; Just as a sanity check, check that our three distributions add up to 1. (reduce + (map p4d6drop1 (range 3 19))) ;-> 1N (reduce + (map p3d6 (range 3 19))) ;-> 1N (reduce + (map pmixed (range 3 19))) ;-> 1N ;; Our three schools, with their different beliefs about how characteristics are generated, can now write down the chances ;; of combinations of characteristics: ;; The traditionalists of the first e'dition say that your chance of ;; getting strength 18, intelligence 18 is simply the chance of ;; getting both scores independently. (defn ptrad [{:keys [str int]}] (* (p3d6 int) (p3d6 str))) ;; Being truly gifted is terribly unlikely: (ptrad {:str 18 :int 18}) ;-> 1/46656 ;; As is being truly disadvantaged: (ptrad {:str 3 :int 3}) ;-> 1/46656 ;; But then, only 1 person in 16 is truly average: (reduce + (map ptrad (for [s '(10 11) i '(10 11)] {:str s :int i}))) ;-> 1/16 ;; Being a weedy genius is just as likely as being a genius who can bend swords. (ptrad {:str 3 :int 18}) ;-> 1/46656 ;; The second e'dition guys still model the two scores as independent things, (defn pindep [{:keys [str int]}] (* (pmixed int) (pmixed str))) ;; But they predict more beefy einsteins (pindep {:str 18 :int 18}) ;-> 25/746496 ;; And fewer weedy vegetables (pindep {:str 3 :int 3}) ;-> 121/6718464 ;; One good, one bad (pindep {:str 3 :int 18}) ;-> 55/2239488 ;; Has about the same frequency as in the traditionalists' model (apply / ((juxt pindep ptrad) {:str 3 :int 18})) ;-> 55/48 ;; The third e'dition guys think that you use either the traditional system for both characteristics, ;; or you use the enhanced system for both. They say that there is a common cause, being a 'player character'. ;; And they say that one in ten people are like that. (defn pcommon [{:keys [str int]}] (+ (* 9/10 (p3d6 int) (p3d6 str)) (* 1/10 (p4d6drop1 int) (p4d6drop1 str)))) ;; If we just look at the frequencies of a characteristic in ;; isolation, then we can't tell the difference between the second and ;; third schools. ;; They make the same guesses about the number of very strong people: ;; The chance of having STR 18 is the chance of having ;; STR 18 and any intelligence (reduce + (map pcommon (for [i (range 3 19)] {:str 18 :int i}))) ;-> 5/864 (reduce + (map pindep (for [i (range 3 19)] {:str 18 :int i}))) ;-> 5/864 (reduce + (map ptrad (for [i (range 3 19)] {:str 18 :int i}))) ;-> 1/216 ;; It's only by looking at both scores together that we can see ;; differences between what the world will look like if there's a ;; common cause and what it will look like if there isn't: ;; Both later e'ditions predict more supermen, but the common causers predict even more than the independents: (map #(int (* 1000000 (float %))) ((juxt pcommon ptrad pindep) {:str 18 :int 18})) ;-> (45 21 33) ;; And they both predict slightly fewer basket cases, but the common causers expect less of a drop. (map #(int (* 1000000 (float %))) ((juxt pcommon ptrad pindep) {:str 3 :int 3})) ;-> (19 21 18) ;; The traditionalists predict slightly more cases of people who are good at only one thing (map #(int (* 1000000 (float %))) ((juxt pcommon ptrad pindep) {:str 18 :int 3})) ;-> (20 21 24) (map #(int (* 1000000 (float %))) ((juxt pcommon ptrad pindep) {:str 3 :int 18})) ;-> (20 21 24) ;; Again, we ought to sanity check our distributions: (reduce + (map pcommon (for [i (range 3 19) j (range 3 19)] {:int i :str j}))) ;-> 1N (reduce + (map ptrad (for [i (range 3 19) j (range 3 19)] {:int i :str j}))) ;-> 1N (reduce + (map pindep (for [i (range 3 19) j (range 3 19)] {:int i :str j}))) ;-> 1N ;; Premature optimization is the root of all evil, and there is no ;; place to optimize more premature than an introductory tutorial ;; essay, but I am worried about my transistors wearing out if I have ;; to use these functions a lot. (def pcommon (memoize pcommon)) (def pmixed (memoize pmixed)) (def ptrad (memoize ptrad))

Subscribe to:
Posts (Atom)