See http://forums.nesdev.com/viewtopic.php?f=2&t=11336&fbclid=IwAR3kU9h4klLFp4_8QzsSEXVhicqvjWUWQLqFWR75Q7rqsyQ3WWRP9UYZFOg


Post subject: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 9:50 am 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
I have written a number of division routines in 6502 assembly, and I'm
posting them here for other people to use. :)

These routines start with any value (0-255) in the accumulator and finish
with the integer division result in the accumulator. They are all constant
cycles and do not use X or Y. Most do require 1 temp register. I've listed
all divisions from 2 to 32 below, including the trivial divide by powers of
2 cases (for sake of completion).

Code:
; Unsigned Integer Division Routines
; by Omegamatrix


;Divide by 2
;1 byte, 2 cycles
  lsr


;Divide by 3
;18 bytes, 30 cycles
  sta  temp
  lsr
  adc  #21
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr


;Divide by 4
;2 bytes, 4 cycles
  lsr
  lsr


;Divide by 5
;18 bytes, 30 cycles
  sta  temp
  lsr
  adc  #13
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr


;Divide by 6
;17 bytes, 30 cycles
  lsr
  sta  temp
  lsr
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr


;Divide by 7 (From December '84 Apple Assembly Line)
;15 bytes, 27 cycles
  sta  temp
  lsr
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr


;Divide by 8
;3 bytes, 6 cycles
  lsr
  lsr
  lsr


;Divide by 9
;17 bytes, 30 cycles
  sta  temp
  lsr
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  lsr


;Divide by 10
;17 bytes, 30 cycles
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr


;Divide by 11
;20 bytes, 35 cycles
  sta  temp
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr


;Divide by 12
;17 bytes, 30 cycles
  lsr
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr


; Divide by 13
; 21 bytes, 37 cycles
  sta  temp
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  clc
  adc  temp
  ror
  lsr
  lsr
  lsr


;Divide by 14
;1/14 = 1/7 * 1/2
;16 bytes, 29 cycles
  sta  temp
  lsr
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr


;Divide by 15
;14 bytes, 24 cycles
  sta  temp
  lsr
  adc  #4
  lsr
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr


;Divide by 16
;4 bytes, 8 cycles
  lsr
  lsr
  lsr
  lsr


;Divide by 17
;18 bytes, 30 cycles
  sta  temp
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  adc  #0
  lsr
  lsr
  lsr
  lsr


;Divide by 18 = 1/9 * 1/2
;18 bytes, 32 cycles
  sta  temp
  lsr
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 19
;17 bytes, 30 cycles
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 20
;18 bytes, 32 cycles
  lsr
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr


;Divide by 21
;20 bytes, 36 cycles
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 22
;21 bytes, 34 cycles
   lsr
   cmp  #33
   adc  #0
   sta  temp
   lsr
   adc  temp
   ror
   adc  temp
   ror
   lsr
   adc  temp
   ror
   lsr
   lsr
   lsr


;Divide by 23
;19 bytes, 34 cycles
  sta  temp
  lsr
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 24
;15 bytes, 27 cycles
   lsr
   lsr
   lsr
   sta   temp
   lsr
   lsr
   adc   temp
   ror
   lsr
   adc   temp
   ror
   lsr


;Divide by 25
;16 bytes, 29 cycles
  sta  temp
  lsr
  lsr
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 26
;21 bytes, 37 cycles
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr


;Divide by 27
;15 bytes, 27 cycles
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 28
;14 bytes, 24 cycles
  lsr
  lsr
  sta  temp
  lsr
  adc  #2
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr


;Divide by 29
;20 bytes, 36 cycles
  sta  temp
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 30
;14 bytes, 26 cycles
  sta  temp
  lsr
  lsr
  lsr
  lsr
  sec
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 31
;14 bytes, 26 cycles
  sta  temp
  lsr
  lsr
  lsr
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


;Divide by 32
  lsr
  lsr
  lsr
  lsr
  lsr


Last edited by Omegamatrix on Sun Jun 15, 2014 3:50 pm, edited 4 times in total.

Top 	
 Profile  
 
rainwarrior 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 10:17 am 
Offline
User avatar

Joined: Sun Jan 22, 2012 12:03 pm
Posts: 1654
Location: Canada 	
Cool!


Top 	
 Profile  
 
Kasumi 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 11:44 am 
Offline
User avatar

Joined: Wed Apr 02, 2008 2:09 pm
Posts: 726 	
Didn't want to continue off topic in the other thread, but cool! The latest
divide by 5 was in the atariage thread, but I do wish I had seen more of the
others. Might as well say what I'm using them for. My game detects 49 slope
heights. (flat, and 24 facing downward left, 24 facing downward right) I did
the trig for the angles of those heights to find what percentage of the
speed should be kept. Then I used whatever divide routine was the closest to
what it would really be out of the ones I could find.

1/2 (0.5)
1/4(0.25)
1/8(0.125)
etc are already accounted for obviously.
Then
3/4(.75)
7/8(0.875)
Because you can do the divide, then subtract it from the original. (In fact
I do this for 1/2, because I wanted a speed of 1 to not be scaled to zero.)
You can do 2/5 with divide by 5 asl etc.I could probably get a little closer
with the other divides, but the biggest difference is like 0.04 and I've
already gotten used to the feel of the game as is.

I'm also using the divide by 5 for a bouncing ball that decays in strength.
I looked up the coefficient of restitution for a basketball bouncing on
concrete and it was .88ish. So I tried original-original/10, but that felt
wrong to interact with in game, so now it's original-original/5.


Top 	
 Profile  
 
bogax 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 12:15 pm 
Offline

Joined: Wed Jul 30, 2008 12:03 am
Posts: 30 	
Omegamatrix wrote:
I have written a number of division routines in 6502 assembly, and I'm
posting them here for other people to use. :)

These routines start with any value (0-255) in the accumulator and finish
with the integer division result in the accumulator. They are all constant
cycles and do not use X or Y. Most do require 1 temp register. I've listed
all divisions from 2 to 32 below, including the trivial divide by powers of
2 cases (for sake of completion).


I always thought it was a pity that you didn't keep
(or maybe even look for) routines of less accuracy.
eg if you know your dividend will never be >40
you might get by with less code.


Top 	
 Profile  
 
zzo38 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 12:50 pm 
Offline

Joined: Mon Feb 07, 2011 12:46 pm
Posts: 614 	
Might you add some for calculating modulo as well? As well as 16-bit routines?

_________________
.


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 1:10 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
bogax wrote:
Omegamatrix wrote:
I have written a number of division routines in 6502 assembly, and I'm
posting them here for other people to use. :)

These routines start with any value (0-255) in the accumulator and finish
with the integer division result in the accumulator. They are all constant
cycles and do not use X or Y. Most do require 1 temp register. I've listed
all divisions from 2 to 32 below, including the trivial divide by powers of
2 cases (for sake of completion).


I always thought it was a pity that you didn't keep
(or maybe even look for) routines of less accuracy.
eg if you know your dividend will never be >40
you might get by with less code.

Funny you should mention that, as I was just staring at the routines I
posted and it occurred to me that in some cases I might be able to go
quicker by doing a division up front to reduce the sum, and follow with a
cruder approximation second. As simple as that sound it never occurred to me
when I wrote them.


So... I was immediately able to make five of the routines better. :D I am
going to update the first post but in summary here are the updated routines:

Old:
Code:
;Divide by 6
;1/6 = 1/3 * 1/2
;19 bytes, 32 cycles
  sta  temp
  lsr
  adc  #21
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr

;Divide by 10
;1/10 = 1/5 * 1/2
;19 bytes, 32 cycles
  sta  temp
  lsr
  adc  #13
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  lsr

;Divide by 20
;1/20 = 1/5 * 1/4
;20 bytes, 34 cycles
  sta  temp
  lsr
  adc  #13
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr

;Divide by 24
;1/24 = 1/3 * 1/8
;21 bytes, 36 cycles
  sta  temp
  lsr
  adc  #21
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr

;Divide by 26
;1/26 = 1/13 * 1/2
;22 bytes, 39 cycles
  sta  temp
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  clc
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


And new:
Code:
;Divide by 6
;17 bytes, 30 cycles
  lsr
  sta  temp
  lsr
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr

;Divide by 10
;17 bytes, 30 cycles
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr

;Divide by 20
;18 bytes, 32 cycles
  lsr
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr

;Divide by 24
;15 bytes, 27 cycles
   lsr
   lsr
   lsr
   sta   temp
   lsr
   lsr
   adc   temp
   ror
   lsr
   adc   temp
   ror
   lsr

;Divide by 26
;21 bytes, 37 cycles
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr


I am very happy that a new solution for divide by 10 has been found. :)


Edit - Divide by 12 has also been superseded!
Double Edit - Divide by 28 too!

old:
Code:
;Divide by 12
;1/12 = 1/3 * 1/4
;20 bytes, 34 cycles
  sta  temp
  lsr
  adc  #21
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr

;Divide by 28
;1/28 = 1/7 * 1/4
;17 bytes, 31 cycles
  sta  temp
  lsr
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


new:
Code:
;Divide by 12
;17 bytes, 30 cycles
  lsr
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr

;Divide by 28
;14 bytes, 24 cycles
  lsr
  lsr
  sta  temp
  lsr
  adc  #2
  lsr
  lsr
  adc  temp
  ror
  lsr
  lsr


Last edited by Omegamatrix on Sat Jun 14, 2014 2:40 pm, edited 2 times in total.

Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 1:15 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
Kasumi wrote:
Didn't want to continue off topic in the other thread, but cool! The latest
divide by 5 was in the atariage thread, but I do wish I had seen more of the
others. Might as well say what I'm using them for. My game detects 49 slope
heights. (flat, and 24 facing downward left, 24 facing downward right) I did
the trig for the angles of those heights to find what percentage of the
speed should be kept. Then I used whatever divide routine was the closest to
what it would really be out of the ones I could find.

1/2 (0.5)
1/4(0.25)
1/8(0.125)
etc are already accounted for obviously.
Then
3/4(.75)
7/8(0.875)
Because you can do the divide, then subtract it from the original. (In fact
I do this for 1/2, because I wanted a speed of 1 to not be scaled to zero.)
You can do 2/5 with divide by 5 asl etc.I could probably get a little closer
with the other divides, but the biggest difference is like 0.04 and I've
already gotten used to the feel of the game as is.

I'm also using the divide by 5 for a bouncing ball that decays in strength.
I looked up the coefficient of restitution for a basketball bouncing on
concrete and it was .88ish. So I tried original-original/10, but that felt
wrong to interact with in game, so now it's original-original/5.

Kasumi, as much as I do love writing these routines, it makes me feel even
more glad to hear from people making use of them! Thank you for sharing and
one day hopefully we will all be able to play your game!!


Jeff


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 1:27 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
zzo38 wrote:
Might you add some for calculating modulo as well? As well as 16-bit
routines?

With modulo, most of the time it should be easy enough to run the division
routine, times the result, and subtract from the original sum.
Multiplication is generally pretty easy... but sometimes can gobble up a lot
of cycles.


I have written a modulo six routine before. I know I posted one on Atariage
a long time ago, but IIRC it wasn't correct and I made one a few months ago
that was. I will look for it. Modulo six is useful for games simulating dice
rolls. I can't remember how good or bad the routine was that I came up with.


For 16-bit routines, I have written one for 10 bit division. It is in my
blog here:

http://atariage.com/forums/blog/563/ent ... ide-by-10/

After finding that new divide by 10 today I know that the 16 bit routine can
also be improved. In general 16 bit division is neither fast in terms of
cycles, and cheap in terms of bytes.


Edit - found my Modulo 6 routine. It looks like I'm just integer dividing by
6, times by 6, and subtracting from the original sum:
Code:
;Mod 6
;28 bytes, 43 cycles
  sta  temp
  lsr
  adc  #21
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  adc  temp
  ror
  and  #$FC
  sta  temp2
  lsr
  adc  temp2
  sbc  temp
  eor  #$FF


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 5:22 pm 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
These are great! I needed a divide by 40 and modulo 40 for a
http://en.wikipedia.org/wiki/DEC_Radix-50 decoder in a game I'm writing for
the Commodore PET 2001( http://pettil.tumblr.com ) and the divide by 10 here
looks like a good start. Thank you!


Top 	
 Profile  
 
psycopathicteen 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 5:30 pm 
Offline

Joined: Wed May 19, 2010 6:12 pm
Posts: 771 	
How does this work?


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 9:41 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
chitselb wrote:
These are great! I needed a divide by 40 and modulo 40 for a
http://en.wikipedia.org/wiki/DEC_Radix-50 decoder in a game I'm writing for
the Commodore PET 2001( http://pettil.tumblr.com ) and the divide by 10 here
looks like a good start. Thank you!

The good thing here is that the 6502 was used by so many systems. :)

As you know, once you have divided by 10 then you are just two shifts away
from a divide by 40.

Code:
;Divide by 40
;19 bytes, 34 cycles
  lsr
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror 
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


But here is an alternate code. It costs 1 more byte, but saves two cycles...
so use whichever one bests suits you.
Code:
;Divide by 40
;20 bytes, 32 cycles
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  #1
  adc  temp
  and  #$C0
  rol
  rol
  rol


And a modulo routine:
Code:
;Mod 40
  sta  temp
  lsr
  adc  #13
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  and  #$E0
  sta  temp2  ;x32
  lsr
  lsr         ;x8
  adc  temp2  ;x40
  sbc  temp
  eor  #$FF


Or both:
Code:
  sta  temp
  lsr
  adc  temp
  ror
  lsr
  lsr
  adc  temp
  ror
  adc  #1
  adc  temp
  and  #$C0
  rol
  rol
  rol

  sta  divideResult   ; divide 40 result... TAY, TAX, PHA could be used

  asl
  asl
  asl
  sta  temp2  ;x8
  asl
  asl         ;x32
  adc  temp2  ;x40
  sbc  temp
  eor  #$FF
                      ; mod 40 result


Top 	
 Profile  
 
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 10:01 pm 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 12555
Location: NE Indiana, USA (NTSC) 	
If you're dividing by 10 to make a binary to decimal converter, there are
ways to do that other than div/mod.


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sat Jun 14, 2014 10:09 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
psycopathicteen wrote:
How does this work?

You are dividing and adding that result to your original sum, and then
repeating the process. The key is find the correct and shortest sequence of
divisions you have to do.

I had a simple idea for finding the divisions. I used a brute attack
approach running every possible combination in an excel sheet. I only tested
for 1/x and ignored truncation... I just looked for division combinations
that were the most accurate under perfect circumstances. Those were the
routines I focused on testing

From there I built a bunch of tools. I made another excel sheet to test
division for all input values (0-255), some C programs to do the same thing
as both excel sheets, a rom for the 2600 to test for verification and help
solve correction factors for some routines. Whenever you see ADC #xx in one
of the routines you are looking at a correction factor I had to introduce.
Sometimes I found I could also use just CLC or SEC at certain points in the
routine to do the correction.


The usage of the routines is easy. The accumulator starts with the value to
be divided, and ends with the integer division result.


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sun Jun 15, 2014 12:12 am 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
Final thoughts on divide by 40 before I go to bed. This is such a big number
that you are approaching the limit where it is simply better to do a looped
divide... if you don't mind the extra cycles and spending X or Y.


I had another idea, this time using both X and Y. This routine takes 3 more
bytes then the combined Div 40 and Mod 40 routine, but it is faster (45
cycles vs 56). It does a comparison and add to get the integer divide. After
that I times the result by 4, as I'm reusing the bytes in the comparison as
a look up table. The values are already there and nicely spread apart by 4
bytes so why not? I had to add a dummy load at the beginning to get first
look up value, that's why the extra LDA #0 is in there.


Code:
;Divide by and Mod 40 combined
;38 bytes, 45 cycles
;Y = value to be divided

InterlacedMultiplyByFortyTable:
  lda  #0        ; dummy load, #0 used in LUT
  lda  #0
  cpy  #40
  adc  #0
  cpy  #80
  adc  #0
  cpy  #120
  adc  #0
  cpy  #160
  adc  #0
  cpy  #200
  adc  #0
  cpy  #240
  adc  #0
 
  sta  divideResult      ; Integer divide 40
 
  asl
  asl
  tax
  tya
  sec
  sbc  InterlacedMultiplyByFortyTable+1,X
                         ; A = Mod 40


Right away I look at this routine and think of putting in some branching to
save bytes, but that would destroy the look-up table that's built in. It's a
neat if not convoluted routine on its own... I'm just not to sure if it is
really useful, but I like to be creative. :lol:


Top 	
 Profile  
 
Bregalad 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Sun Jun 15, 2014 2:48 am 
Offline
User avatar

Joined: Fri Nov 12, 2004 2:49 pm
Posts: 6062
Location: Jongny, VD, Switzerland 	
The most useful in the case of the NES is probably divide-by-15, as a
nametable can show 15 2x2 metatiles vertically, so you'll need to divide by
15 before knowing where to write your metatile when scrolling vertically.
(yes I know there are workaround this, but still this is one of the ways to
do this).

As for divide by power of two, you don't have to remind us they're shifts, I
think everyone already knowns.


Post subject: Re: Unsigned Integer Division Routines
Posted: Sun Jun 15, 2014 3:54 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
I've updated the divide by 22 with a quicker routine. Same amount of bytes as before.

Old:
Code:
;Divide by 22
;1/22 = 1/11 * 1/2
;21 bytes, 37 cycles
  sta  temp
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


New:
Code:
;Divide by 22
;21 bytes, 34 cycles
   lsr
   cmp  #33
   adc  #0
   sta  temp
   lsr
   adc  temp
   ror
   adc  temp
   ror
   lsr
   adc  temp
   ror
   lsr
   lsr
   lsr


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 01, 2014 2:08 am 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
The idea behind Rad50 http://en.wikipedia.org/wiki/RADIX-50 is that 40x40x40
= 64000, so if you limit your character set to 40 characters (26 letters +
10 digits + 4 whatevers) you can squeeze three bytes of text into a two byte
unsigned, every time. 33% compression.

In these routines, the divisor is 8-bit. Scaling up to 16-bit e.g. LSR
becomes "LSR x+1; ROR x" introduces inaccuracies once the divisor is greater
than 256. 28741 /10 = 2862


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 01, 2014 4:42 am 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
I played around with it in a spreadsheet, and I noticed this. It's 32-bit
math, but it doesn't have to be an obscene amount of 6502 code

(x*13107+13100)/65536 = x/5

and 13107 = 3 + 48 + 192 + 12288 (triple it; add and shift 4; add and shift
4; add and shift 4; add (and shift 4))

Here it is in C

Code:
#include <stdio.h>
int main(void)
{
   int z=0;
   int i;
   for (i=0; i<64000; i++) {
      int t=0;
      int a=i*3;
      int j;
      for (j=0; j<4; j++) {
         t += a;
         a *= 16;
      }
      t += 13100;  /* a fudge factor */
      int x=(t/65536)>>3;
      int y=(i/40);
      z += abs(x-y);
      printf("%6d  %6d  %6d  %6d\n",i,x,y,z);
   }
}


Top 	
 Profile  
 
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 01, 2014 8:05 am 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 12555
Location: NE Indiana, USA (NTSC) 	
In practice, I wonder whether base 32 might be quicker to decode and nearly
as effective at compression. See ITA2 and Z-character.


Top 	
 Profile  
 
doynax 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 01, 2014 4:39 pm 
Offline

Joined: Mon Nov 22, 2004 3:24 pm
Posts: 158
Location: Sweden 	
Omegamatrix wrote:
I had a simple idea for finding the divisions. I used a brute attack
approach running every possible combination in an excel sheet. I only tested
for 1/x and ignored truncation... I just looked for division combinations
that were the most accurate under perfect circumstances. Those were the
routines I focused on testing

From there I built a bunch of tools. I made another excel sheet to test
division for all input values (0-255), some C programs to do the same thing
as both excel sheets, a rom for the 2600 to test for verification and help
solve correction factors for some routines. Whenever you see ADC #xx in one
of the routines you are looking at a correction factor I had to introduce.
Sometimes I found I could also use just CLC or SEC at certain points in the
routine to do the correction. Clever. I think I'll keep these routines handy
for future needs.

I don't suppose you've considered rigging up a super-optimizer and letting
it loose on the problem? It seems like a relatively limited number of
permutations to search through, though I suppose the immediate fudge factor
might cause trouble.

I know I've occasionally wished that someone would take the time to write a
good one for the 6502. Ideally with a flexible scheme for evaluating the
results, a flexible opcode/immediate search space and specifying other
constraints.

It just seems like 6502 optimization in practice as often as not boils down
to a puzzle of the form: shuffle bit X into register Y at precisely cycle Z
while preserving carry. What remains is then a brain-dead task of working
through the possible permutations until you find the best one.


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 01, 2014 9:11 pm 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
Here's what I came up with. Probably not optimal, and at 179 bytes, way more
code than I want. It's a Forth word in http://pettil.tumblr.com if you're
wondering what is all that weird stuff on the fringe of the code


Code:
;--------------------------------------------------------------
#if 0
name=40/MOD
stack=( u -- u%40 u/40 )
tags=math
Perform a divide by 40 and a modulo 40, for
[[Radix50|http://en.wikipedia.org/wiki/DEC_Radix-50]]
#endif
slmod40
    stx storex
    lda #0
    ldx #7
slmod40a
    sta n,x
    dex
    bpl slmod40a                ; zero n+0..n+7
    ldx #2
slmod40b
    clc
    lda n                       ; addend = n*3
    adc tos
    sta n
    lda n+1
    adc tos+1
    sta n+1
    bcc slmod40c
    inc n+2
slmod40c
    dex
    bpl slmod40b
    lda #4
    sta n+8
slmod40d
    clc
    lda n+0
    adc n+4
    sta n+4
    lda n+1
    adc n+5
    sta n+5
    lda n+2
    adc n+6
    sta n+6
    lda n+3
    adc n+7
    sta n+7                     ; sum += addend

    ldy #4
slmod40e
    asl n
    rol n+1
    rol n+2
    rol n+3                     ; addend << 4
    dey
    bne slmod40e
    dec n+8                     ; repeat 4x (3+48+768+12288 = 13107)
    bne slmod40d

    clc
    lda n+4
    adc #<13100
    sta n+4
    lda n+5
    adc #>13100
    sta n+5
    bcc slmod40f
    inc n+6
    bne slmod40f
    inc n+7                     ; sum += 13100 (fudge factor)
slmod40f
    lda n+7                     ; (n+6) is now u/5
    lsr
    ror n+6
    lsr
    ror n+6
    lsr
    ror n+6
    sta n+7                     ; (n+6) = u/40

    lda n+6
    sta n+0
    lda n+7
    sta n+1
    sty n+2
    sty n+3
    asl n
    rol n+1
    asl n
    rol n+1                     ; n = u/40*4
    ;clc
    lda n
    adc n+6
    sta n
    lda n+1
    adc n+7
    sta n+1                     ; n = u/40*5
    asl n
    rol n+1
    asl n
    rol n+1
    asl n
    rol n+1                     ; n = u/40*5*8
    sec
    lda tos
    sbc n
    sta tos
    lda tos+1
    sbc n+1
    sta tos+1                   ; tos =  u % 40
    lda n+6
    ldy n+7                     ; push   u / 40
    ldx storex
    jmp pushya


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 01, 2014 9:28 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
doynax wrote:
Clever. I think I'll keep these routines handy for future needs.

I don't suppose you've considered rigging up a super-optimizer and letting
it loose on the problem? It seems like a relatively limited number of
permutations to search through, though I suppose the immediate fudge factor
might cause trouble.

I know I've occasionally wished that someone would take the time to write a
good one for the 6502. Ideally with a flexible scheme for evaluating the
results, a flexible opcode/immediate search space and specifying other
constraints.

It just seems like 6502 optimization in practice as often as not boils down
to a puzzle of the form: shuffle bit X into register Y at precisely cycle Z
while preserving carry. What remains is then a brain-dead task of working
through the possible permutations until you find the best one.

Thanks! It's always nice to hear people might make use of these routines. I
wrote them because to me they are like solving little puzzles.

The idea came up before on AtariAge about letting a super computer mull over
every usable opcode. Who knows what weird combination of opcodes would bring
a faster, shorter solution? Nothing came of that idea though... I suppose
though that just a small pool of opcodes might be workable on a home
computer. Say just using STA, ASL, LSR, ROR, ROL, CLC, SEC, ADC zeropage,
ADC immediate, SBC zeropage, sbc immediate, EOR zeropage, EOR immediate. I
would propose constraining the routine to 36 bytes or less, and 36 cycles or
less because most of the routines I wrote are already shorter than that. I
would also constrain it to no more then two temp registers, and preferably
just one. If the program could crunch all that then I would try adding in
ROL, ROR, ASL, and LSR (all zeropage) next.


Top 	
 Profile  
 
doynax 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Wed Jul 02, 2014 3:17 am 
Offline

Joined: Mon Nov 22, 2004 3:24 pm
Posts: 158
Location: Sweden 	
Omegamatrix wrote:
The idea came up before on AtariAge about letting a super computer mull over
every usable opcode. Who knows what weird combination of opcodes would bring
a faster, shorter solution? Nothing came of that idea though... I suppose
though that just a small pool of opcodes might be workable on a home
computer. Say just using STA, ASL, LSR, ROR, ROL, CLC, SEC, ADC zeropage,
ADC immediate, SBC zeropage, sbc immediate, EOR zeropage, EOR immediate. I
would propose constraining the routine to 36 bytes or less, and 36 cycles or
less because most of the routines I wrote are already shorter than that. I
would also constrain it to no more then two temp registers, and preferably
just one. If the program could crunch all that then I would try adding in
ROL, ROR, ASL, and LSR (all zeropage) next.

Such devices have been used with some degree of success in the past. Mostly
to find short bit-hacks abusing the architecture in some novel way or
another. Massalin's original paper is a good read, and there is a PIC16
implementation for instance.

There are other other strategies of course but the brute-force method of
just generating all permutations from a limited set is workable. They key is
to keep the number of opcodes low (counting each immediate and temporary as
a distinct instruction) and while for a general-purpose optimize this may
not be feasible I'd be more interested in a guided search automate the sort
of thing you've been doing by hand here. For instance in the case of
division you might first search for roughly the right answer then add in the
fudge-factor separately.

Note that virtually all generated sequences are completely wrong, so
validating them is easy with a couple of well-picked counterexamples. A
full-blown theorem prover is only necessary for confirming the final result,
if at all.

Realistically with, say, 25 opcodes, a bit of pruning and a fast search
function you might be able to reach around 12 instructions on a small
cluster running overnight.


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Thu Jul 03, 2014 7:23 am 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
I glanced at Massalin's paper and that looks like an interesting solution
they found for BCD. I will have to read it more later when I have time.


I thought more about the computation today. It would be nice to also include
ORA and AND, both zeropage and immediate. It's all the immediates that
explode the number of states. ADC, SBC, and EOR immediate take 256 states.
ORA and AND only need 255 states as you can skip ORA #0 and AND #$FF.

Code:
;opcode     states
; rol         1
; lsr         1
; asl         1
; ror         1
; clc         1
; sec         1
; adc temp    1
; sbc temp    1
; eor temp    1
; and temp    1
; ora temp    1
; adc #imm   256
; sbc #imm   256
; eor #imm   256
; and #imm   255
; ora #imm   255


That totals 1289 states. To do 12 instructions would be 1289^12 = 2.1 x
10^37 routines... so pretty heavy. If we skipped all of the immediates then
we have a small pool of 11 different states. 11^12 = 2.85 x 10^11 routines,
which is doable. I'd much rather run through all the routines and resolve
all possible fudge factors at once, but that is some serious computation
power needed.


For a good initial test value, 255, 238, and 239 are all very good to use. I
did a quick excel sheet for calculating the number of unique results for
each input (0-255) when dividing by 3 to 31 (skipping divide by 2, 4, 8, and
16). The rightmost column displays the number of unique values for that
input.

Attachment:
TestValue.jpg
TestValue.jpg [ 236.38 KiB | Viewed 269 times ]


238 and 239 give the same results for division. More importantly 238 and 255
give unique results for division 3 to 19. This makes it easy to test for
initial correctness of the routine with just two checks.


Top 	
 Profile  
 
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Fri Jul 04, 2014 9:55 am 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 12555
Location: NE Indiana, USA (NTSC) 	
I found another fudge if you're allowed to change the encoder. Have the
encoder multiply each 16-bit word by 65536/64000 = 128/125 = 1.024, so that
the decoder works by multiplying by 40 (shift, shift, add, shift, shift)
instead of dividing by 40. The principle is that of arithmetic coding.


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 08, 2014 7:30 pm 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
tepples wrote:
I found another fudge if you're allowed to change the encoder. Have the
encoder multiply each 16-bit word by 65536/64000 = 128/125 = 1.024, so that
the decoder works by multiplying by 40 (shift, shift, add, shift, shift)
instead of dividing by 40. The principle is that of arithmetic coding.


The encoder will run on a non-6502 computer, so a faster decoder would be
wonderful. I stared at this for a bit and couldn't figure out how it works.
Given "DOG" which encodes to 4, 15, 7, or 4*1600+15*40+7 = 7007, I multiply
that *1.024 and wind up with 7175.168. I don't have the luxury of storing
the fractional portion, so iteratively multiplying 7175*40 and dividing by
65536 results in 4, 15, 6.8359375

I see where you're going with it, but how does it translate into an
algorithm that yields decodable output?


Top 	
 Profile  
 
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Tue Jul 08, 2014 7:52 pm 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 12555
Location: NE Indiana, USA (NTSC) 	
Try rounding the multiplication by 1.024 up using ceil().


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Wed Jul 09, 2014 7:12 am 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
tepples wrote:
Try rounding the multiplication by 1.024 up using ceil().


I played around with it in a spreadsheet and there are inaccuracies. It
would probaby work if more bits were available for the fractions, but in
Rad50 they just aren't there.


Top 	
 Profile  
 
lidnariq 	
 Post subject: Re: Unsigned Integer Division Routines
Posted: Wed Jul 09, 2014 2:01 pm 
Offline

Joined: Sun Apr 13, 2008 11:12 am
Posts: 2411
Location: seattle 	
Worked for me:
Code:
#!/usr/bin/perl
use POSIX 'ceil';
for ($x = 0; $x < 40; $x++) {
   for ($y = 0; $y < 40; $y++) {
      for ($z = 0; $z < 40; $z++) {
         my $sum = $x*1600 + $y*40 + $z;
         $sum = ceil($sum * 1.024);

         $sum *= 40; $a = $sum >> 16; $sum &= 0xFFFF;
         $sum *= 40; $b = $sum >> 16; $sum &= 0xFFFF;
         $c = ($sum * 40) >> 16;
         if ($x != $a or $y != $b or $z != $c) {
            printf "%2d=%2d %2d=%2d %2d=%2d\n",$x,$a,$y,$b,$z,$c;
         }
      }
   }
}


Post subject: Re: Unsigned Integer Division Routines
Posted: Wed Jul 09, 2014 2:34 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 29
Location: Canada 	
chitselb are you using 16 bit? I do have a 16 bit divide by 10 routine that
you could tweak to get a div 40 and mod 40. That being said if you can just
use multiply 40 then you probably way better off.


Code:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;    UNSIGNED DIVIDE BY 10 (16 BIT)
;    By Omegamatrix
;    126 cycles (max), 79 bytes
;
;    Start with 16 bit value in counterHigh, counterLow
;    End with 16 bit result in highTen, lowTen
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

TensRemaining:
    .byte 0,25,51,76,102,128,153,179,204,230
ModRemaing:
    .byte 0,6,2,8,4,0,6,2,8,4

.startDivideBy10:
    ldy    #-2                   ;2   @2   skips a branch the first time through
    lda    counterHigh           ;3   @5
.do8bitDiv:
    sta    temp                  ;3   @8
    lsr                          ;2   @10
    adc    #13                   ;2   @12
    adc    temp                  ;3   @15
    ror                          ;2   @17
    lsr                          ;2   @19
    lsr                          ;2   @21
    adc    temp                  ;3   @24
    ror                          ;2   @26
    adc    temp                  ;3   @29
    ror                          ;2   @31
    lsr                          ;2   @33
    and    #$7C                  ;2   @35   AND'ing here...
    sta    temp                  ;3   @38   and saving result as highTen (times 4)
    lsr                          ;2   @40
    lsr                          ;2   @42
    iny                          ;2   @44
    bpl    .finishLowTen         ;2/3 @46/47...120
                                      
    sta    highTen               ;3   @49
    adc    temp                  ;3   @52   highTen (times 5)
    asl                          ;2   @54   highTen (times 10)
    sbc    counterHigh           ;3   @57
    eor    #$FF                  ;2   @59
    tay                          ;2   @61   mod 10 result!
                                      
    lda    TensRemaining,Y       ;4   @65   Fill the low byte with the tens it should
    sta    lowTen                ;3   @68   have at this point from the high byte divide.
    lda    counterLow            ;3   @71
    adc    ModRemaing,Y          ;4   @75
    bcc    .do8bitDiv            ;2/3 @77/78

.overflowFound:
    cmp    #4                    ;2   @79   We have overflowed, but we can apply a shortcut.
    lda    #25                   ;2   @81   Divide by 10 will be at least 25, and the
                                 ;          carry is set when higher for the next addition.
.finishLowTen:
    adc    lowTen                ;3   @123
    sta    lowTen                ;3   @126  routine ends at either 87 or 126 cycles


Post new topic Reply to topic 	 Page 2 of 3
	 [ 31 posts ]  	Go to page Previous  1, 2, 3  Next
  Print view 	Previous topic | Next topic 
Author 	Message
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Sun Jun 15, 2014 9:19 am 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 20704
Location: NE Indiana, USA (NTSC) 	
Unless, of course, you use tokumaru's solution of storing world-relative and nametable-relative camera positions separately.


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Sun Jun 15, 2014 3:54 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 35
Location: Canada 	
I've updated the divide by 22 with a quicker routine. Same amount of bytes as before.

Old:
Code:
;Divide by 22
;1/22 = 1/11 * 1/2
;21 bytes, 37 cycles
  sta  temp
  lsr
  lsr
  adc  temp
  ror
  adc  temp
  ror
  adc  temp
  ror
  lsr
  adc  temp
  ror
  lsr
  lsr
  lsr
  lsr


New:
Code:
;Divide by 22
;21 bytes, 34 cycles
   lsr
   cmp  #33
   adc  #0
   sta  temp
   lsr
   adc  temp
   ror
   adc  temp
   ror
   lsr
   adc  temp
   ror
   lsr
   lsr
   lsr


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 01, 2014 2:08 am 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
The idea behind Rad50 http://en.wikipedia.org/wiki/RADIX-50 is that 40x40x40 = 64000, so if you limit your character set to 40 characters (26 letters + 10 digits + 4 whatevers) you can squeeze three bytes of text into a two byte unsigned, every time. 33% compression.

In these routines, the divisor is 8-bit. Scaling up to 16-bit e.g. LSR becomes "LSR x+1; ROR x" introduces inaccuracies once the divisor is greater than 256. 28741 /10 = 2862


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 01, 2014 4:42 am 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
I played around with it in a spreadsheet, and I noticed this. It's 32-bit math, but it doesn't have to be an obscene amount of 6502 code

(x*13107+13100)/65536 = x/5

and 13107 = 3 + 48 + 192 + 12288 (triple it; add and shift 4; add and shift 4; add and shift 4; add (and shift 4))

Here it is in C

Code:
#include <stdio.h>
int main(void)
{
   int z=0;
   int i;
   for (i=0; i<64000; i++) {
      int t=0;
      int a=i*3;
      int j;
      for (j=0; j<4; j++) {
         t += a;
         a *= 16;
      }
      t += 13100;  /* a fudge factor */
      int x=(t/65536)>>3;
      int y=(i/40);
      z += abs(x-y);
      printf("%6d  %6d  %6d  %6d\n",i,x,y,z);
   }
}


Top 	
 Profile  
 
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 01, 2014 8:05 am 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 20704
Location: NE Indiana, USA (NTSC) 	
In practice, I wonder whether base 32 might be quicker to decode and nearly as effective at compression. See ITA2 and Z-character.


Top 	
 Profile  
 
doynax 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 01, 2014 4:39 pm 
Offline

Joined: Mon Nov 22, 2004 3:24 pm
Posts: 162
Location: Sweden 	
Omegamatrix wrote:
I had a simple idea for finding the divisions. I used a brute attack approach running every possible combination in an excel sheet. I only tested for 1/x and ignored truncation... I just looked for division combinations that were the most accurate under perfect circumstances. Those were the routines I focused on testing

From there I built a bunch of tools. I made another excel sheet to test division for all input values (0-255), some C programs to do the same thing as both excel sheets, a rom for the 2600 to test for verification and help solve correction factors for some routines. Whenever you see ADC #xx in one of the routines you are looking at a correction factor I had to introduce. Sometimes I found I could also use just CLC or SEC at certain points in the routine to do the correction.
Clever. I think I'll keep these routines handy for future needs.

I don't suppose you've considered rigging up a super-optimizer and letting it loose on the problem? It seems like a relatively limited number of permutations to search through, though I suppose the immediate fudge factor might cause trouble.

I know I've occasionally wished that someone would take the time to write a good one for the 6502. Ideally with a flexible scheme for evaluating the results, a flexible opcode/immediate search space and specifying other constraints.

It just seems like 6502 optimization in practice as often as not boils down to a puzzle of the form: shuffle bit X into register Y at precisely cycle Z while preserving carry. What remains is then a brain-dead task of working through the possible permutations until you find the best one.


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 01, 2014 9:11 pm 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
Here's what I came up with. Probably not optimal, and at 179 bytes, way more code than I want. It's a Forth word in http://pettil.tumblr.com if you're wondering what is all that weird stuff on the fringe of the code


Code:
;--------------------------------------------------------------
#if 0
name=40/MOD
stack=( u -- u%40 u/40 )
tags=math
Perform a divide by 40 and a modulo 40, for
[[Radix50|http://en.wikipedia.org/wiki/DEC_Radix-50]]
#endif
slmod40
    stx storex
    lda #0
    ldx #7
slmod40a
    sta n,x
    dex
    bpl slmod40a                ; zero n+0..n+7
    ldx #2
slmod40b
    clc
    lda n                       ; addend = n*3
    adc tos
    sta n
    lda n+1
    adc tos+1
    sta n+1
    bcc slmod40c
    inc n+2
slmod40c
    dex
    bpl slmod40b
    lda #4
    sta n+8
slmod40d
    clc
    lda n+0
    adc n+4
    sta n+4
    lda n+1
    adc n+5
    sta n+5
    lda n+2
    adc n+6
    sta n+6
    lda n+3
    adc n+7
    sta n+7                     ; sum += addend

    ldy #4
slmod40e
    asl n
    rol n+1
    rol n+2
    rol n+3                     ; addend << 4
    dey
    bne slmod40e
    dec n+8                     ; repeat 4x (3+48+768+12288 = 13107)
    bne slmod40d

    clc
    lda n+4
    adc #<13100
    sta n+4
    lda n+5
    adc #>13100
    sta n+5
    bcc slmod40f
    inc n+6
    bne slmod40f
    inc n+7                     ; sum += 13100 (fudge factor)
slmod40f
    lda n+7                     ; (n+6) is now u/5
    lsr
    ror n+6
    lsr
    ror n+6
    lsr
    ror n+6
    sta n+7                     ; (n+6) = u/40

    lda n+6
    sta n+0
    lda n+7
    sta n+1
    sty n+2
    sty n+3
    asl n
    rol n+1
    asl n
    rol n+1                     ; n = u/40*4
    ;clc
    lda n
    adc n+6
    sta n
    lda n+1
    adc n+7
    sta n+1                     ; n = u/40*5
    asl n
    rol n+1
    asl n
    rol n+1
    asl n
    rol n+1                     ; n = u/40*5*8
    sec
    lda tos
    sbc n
    sta tos
    lda tos+1
    sbc n+1
    sta tos+1                   ; tos =  u % 40
    lda n+6
    ldy n+7                     ; push   u / 40
    ldx storex
    jmp pushya


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 01, 2014 9:28 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 35
Location: Canada 	
doynax wrote:
Clever. I think I'll keep these routines handy for future needs.

I don't suppose you've considered rigging up a super-optimizer and letting it loose on the problem? It seems like a relatively limited number of permutations to search through, though I suppose the immediate fudge factor might cause trouble.

I know I've occasionally wished that someone would take the time to write a good one for the 6502. Ideally with a flexible scheme for evaluating the results, a flexible opcode/immediate search space and specifying other constraints.

It just seems like 6502 optimization in practice as often as not boils down to a puzzle of the form: shuffle bit X into register Y at precisely cycle Z while preserving carry. What remains is then a brain-dead task of working through the possible permutations until you find the best one.

Thanks! It's always nice to hear people might make use of these routines. I wrote them because to me they are like solving little puzzles.

The idea came up before on AtariAge about letting a super computer mull over every usable opcode. Who knows what weird combination of opcodes would bring a faster, shorter solution? Nothing came of that idea though... I suppose though that just a small pool of opcodes might be workable on a home computer. Say just using STA, ASL, LSR, ROR, ROL, CLC, SEC, ADC zeropage, ADC immediate, SBC zeropage, sbc immediate, EOR zeropage, EOR immediate. I would propose constraining the routine to 36 bytes or less, and 36 cycles or less because most of the routines I wrote are already shorter than that. I would also constrain it to no more then two temp registers, and preferably just one. If the program could crunch all that then I would try adding in ROL, ROR, ASL, and LSR (all zeropage) next.


Top 	
 Profile  
 
doynax 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Wed Jul 02, 2014 3:17 am 
Offline

Joined: Mon Nov 22, 2004 3:24 pm
Posts: 162
Location: Sweden 	
Omegamatrix wrote:
The idea came up before on AtariAge about letting a super computer mull over every usable opcode. Who knows what weird combination of opcodes would bring a faster, shorter solution? Nothing came of that idea though... I suppose though that just a small pool of opcodes might be workable on a home computer. Say just using STA, ASL, LSR, ROR, ROL, CLC, SEC, ADC zeropage, ADC immediate, SBC zeropage, sbc immediate, EOR zeropage, EOR immediate. I would propose constraining the routine to 36 bytes or less, and 36 cycles or less because most of the routines I wrote are already shorter than that. I would also constrain it to no more then two temp registers, and preferably just one. If the program could crunch all that then I would try adding in ROL, ROR, ASL, and LSR (all zeropage) next.

Such devices have been used with some degree of success in the past. Mostly to find short bit-hacks abusing the architecture in some novel way or another. Massalin's original paper is a good read, and there is a PIC16 implementation for instance.

There are other other strategies of course but the brute-force method of just generating all permutations from a limited set is workable. They key is to keep the number of opcodes low (counting each immediate and temporary as a distinct instruction) and while for a general-purpose optimize this may not be feasible I'd be more interested in a guided search automate the sort of thing you've been doing by hand here. For instance in the case of division you might first search for roughly the right answer then add in the fudge-factor separately.
Note that virtually all generated sequences are completely wrong, so validating them is easy with a couple of well-picked counterexamples. A full-blown theorem prover is only necessary for confirming the final result, if at all.

Realistically with, say, 25 opcodes, a bit of pruning and a fast search function you might be able to reach around 12 instructions on a small cluster running overnight.


Top 	
 Profile  
 
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Thu Jul 03, 2014 7:23 am 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 35
Location: Canada 	
I glanced at Massalin's paper and that looks like an interesting solution they found for BCD. I will have to read it more later when I have time.


I thought more about the computation today. It would be nice to also include ORA and AND, both zeropage and immediate. It's all the immediates that explode the number of states. ADC, SBC, and EOR immediate take 256 states. ORA and AND only need 255 states as you can skip ORA #0 and AND #$FF.

Code:
;opcode     states
; rol         1
; lsr         1
; asl         1
; ror         1
; clc         1
; sec         1
; adc temp    1
; sbc temp    1
; eor temp    1
; and temp    1
; ora temp    1
; adc #imm   256
; sbc #imm   256
; eor #imm   256
; and #imm   255
; ora #imm   255


That totals 1289 states. To do 12 instructions would be 1289^12 = 2.1 x 10^37 routines... so pretty heavy. If we skipped all of the immediates then we have a small pool of 11 different states. 11^12 = 2.85 x 10^11 routines, which is doable. I'd much rather run through all the routines and resolve all possible fudge factors at once, but that is some serious computation power needed.


For a good initial test value, 255, 238, and 239 are all very good to use. I did a quick excel sheet for calculating the number of unique results for each input (0-255) when dividing by 3 to 31 (skipping divide by 2, 4, 8, and 16). The rightmost column displays the number of unique values for that input.

Attachment:
TestValue.jpg
TestValue.jpg [ 236.38 KiB | Viewed 3215 times ]


238 and 239 give the same results for division. More importantly 238 and 255 give unique results for division 3 to 19. This makes it easy to test for initial correctness of the routine with just two checks.


Top 	
 Profile  
 
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Fri Jul 04, 2014 9:55 am 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 20704
Location: NE Indiana, USA (NTSC) 	
I found another fudge if you're allowed to change the encoder. Have the encoder multiply each 16-bit word by 65536/64000 = 128/125 = 1.024, so that the decoder works by multiplying by 40 (shift, shift, add, shift, shift) instead of dividing by 40. The principle is that of arithmetic coding.


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 08, 2014 7:30 pm 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
tepples wrote:
I found another fudge if you're allowed to change the encoder. Have the encoder multiply each 16-bit word by 65536/64000 = 128/125 = 1.024, so that the decoder works by multiplying by 40 (shift, shift, add, shift, shift) instead of dividing by 40. The principle is that of arithmetic coding.


The encoder will run on a non-6502 computer, so a faster decoder would be wonderful. I stared at this for a bit and couldn't figure out how it works. Given "DOG" which encodes to 4, 15, 7, or 4*1600+15*40+7 = 7007, I multiply that *1.024 and wind up with 7175.168. I don't have the luxury of storing the fractional portion, so iteratively multiplying 7175*40 and dividing by 65536 results in 4, 15, 6.8359375

I see where you're going with it, but how does it translate into an algorithm that yields decodable output?


Top 	
 Profile  
 
tepples 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Tue Jul 08, 2014 7:52 pm 
Offline

Joined: Sun Sep 19, 2004 11:12 pm
Posts: 20704
Location: NE Indiana, USA (NTSC) 	
Try rounding the multiplication by 1.024 up using ceil().


Top 	
 Profile  
 
chitselb 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Wed Jul 09, 2014 7:12 am 
Offline

Joined: Sat Jun 14, 2014 2:14 pm
Posts: 6 	
tepples wrote:
Try rounding the multiplication by 1.024 up using ceil().


I played around with it in a spreadsheet and there are inaccuracies. It would probaby work if more bits were available for the fractions, but in Rad50 they just aren't there.


Top 	
 Profile  
 
lidnariq 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Wed Jul 09, 2014 2:01 pm 
Offline

Joined: Sun Apr 13, 2008 11:12 am
Posts: 7695
Location: Seattle 	
Worked for me:
Code:
#!/usr/bin/perl
use POSIX 'ceil';
for ($x = 0; $x < 40; $x++) {
   for ($y = 0; $y < 40; $y++) {
      for ($z = 0; $z < 40; $z++) {
         my $sum = $x*1600 + $y*40 + $z;
         $sum = ceil($sum * 1.024);

         $sum *= 40; $a = $sum >> 16; $sum &= 0xFFFF;
         $sum *= 40; $b = $sum >> 16; $sum &= 0xFFFF;
         $c = ($sum * 40) >> 16;
         if ($x != $a or $y != $b or $z != $c) {
            printf "%2d=%2d %2d=%2d %2d=%2d\n",$x,$a,$y,$b,$z,$c;
         }
      }
   }
}


-----------------------------------------------------------------------------------------


Unsigned Integer Division Routines

Moderator: Moderators


Post new topic Reply to topic 	 Page 3 of 3
	 [ 31 posts ]  	Go to page Previous  1, 2, 3
  Print view 	Previous topic | Next topic 
Author 	Message
Omegamatrix 	
 Post subject: Re: Unsigned Integer Division Routines
PostPosted: Wed Jul 09, 2014 2:34 pm 
Offline
User avatar

Joined: Tue Jun 10, 2014 8:15 pm
Posts: 35
Location: Canada 	
chitselb are you using 16 bit? I do have a 16 bit divide by 10 routine that you could tweak to get a div 40 and mod 40. That being said if you can just use multiply 40 then you probably way better off.


Code:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;    UNSIGNED DIVIDE BY 10 (16 BIT)
;    By Omegamatrix
;    126 cycles (max), 79 bytes
;
;    Start with 16 bit value in counterHigh, counterLow
;    End with 16 bit result in highTen, lowTen
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

TensRemaining:
    .byte 0,25,51,76,102,128,153,179,204,230
ModRemaing:
    .byte 0,6,2,8,4,0,6,2,8,4

.startDivideBy10:
    ldy    #-2                   ;2  @2   skips a branch the first time through
    lda    counterHigh           ;3  @5
.do8bitDiv:
    sta    temp                  ;3  @8
    lsr                          ;2  @10
    adc    #13                   ;2  @12
    adc    temp                  ;3  @15
    ror                          ;2  @17
    lsr                          ;2  @19
    lsr                          ;2  @21
    adc    temp                  ;3  @24
    ror                          ;2  @26
    adc    temp                  ;3  @29
    ror                          ;2  @31
    lsr                          ;2  @33
    and    #$7C                  ;2  @35   AND'ing here...
    sta    temp                  ;3  @38   and saving result as highTen (times 4)
    lsr                          ;2  @40
    lsr                          ;2  @42
    iny                          ;2  @44
    bpl    .finishLowTen         ;2� @46/47...120

    sta    highTen               ;3  @49
    adc    temp                  ;3  @52   highTen (times 5)
    asl                          ;2  @54   highTen (times 10)
    sbc    counterHigh           ;3  @57
    eor    #$FF                  ;2  @59
    tay                          ;2  @61   mod 10 result!

    lda    TensRemaining,Y       ;4  @65   Fill the low byte with the tens it should
    sta    lowTen                ;3  @68   have at this point from the high byte divide.
    lda    counterLow            ;3  @71
    adc    ModRemaing,Y          ;4  @75
    bcc    .do8bitDiv            ;2� @77/78

.overflowFound:
    cmp    #4                    ;2  @79   We have overflowed, but we can apply a shortcut.
    lda    #25                   ;2  @81   Divide by 10 will be at least 25, and the
                                 ;         carry is set when higher for the next addition.
.finishLowTen:
    adc    lowTen                ;3  @123
    sta    lowTen                ;3  @126  routine ends at either 87 or 126 cycles