# diff --- isolate differences between two files define (MAX_UNIQUE_LINES,6000) # no. of unique lines in all files define (MAX_UNIQUE_LINES2,12000) # must be twice MAX_UNIQUE_LINES define (NULL_POINTER,0) define (HASH_TABLE_SIZE,6073) # must be prime, as large as possible define (MAX_FILE_SIZE,6000) # no. of lines in largest input file define (MAXBUF,1024) # size of buffer for binary compare define (sym_pointer,integer) # large enough to index MAX_UNIQUE_LINES2 define (hash_index,integer) # large enough to index HASH_TABLE_SIZE define (file_mark,long_int) # large enough to hold a file position define (DIFFERENCES,1) # -d => list differences define (REVISION,2) # -r => revision bar requests for 'fmt' define (SCRIPT,3) # -s => update script for 'ed' define (COMPARISON,4) # -c => simple line-by-line compare define (BINARY_COMPARISON,5) # -b => simple word-by-word compare define (ON,) define (OFF,#) define (DEBUG,OFF) # turn debugging output on/off define (TUNING,OFF) # turn algorithm tuning output on/off include "diff_com.r.i" call initialize if (Option == COMPARISON) call simple_compare elif (Option == BINARY_COMPARISON) call binary_compare else { call load call pair call grow call label call report } stop end # enter --- enter a line in the symbol table, return its index sym_pointer function enter (line) character line (ARB) include "diff_com.r.i" hash_index h hash_index hash sym_pointer i, p file_mark markf character text (MAXLINE) integer equal h = hash (line) p = Bucket (h) while (p ~= NULL_POINTER) { i = Sym_store (p + 1) # grab index field of entry structure call seekf (Text_loc (i), Text_file, ABS) call getlin (text, Text_file) if (equal (line, text) == YES) return (i) # we got it; return its useful index p = Sym_store (p) # try next item in the chain DEBUG call print (ERROUT, "probing in lookup: ptr = *i, line = '*s'*n"s, DEBUG p, line) } if (Next_inx >= MAX_UNIQUE_LINES) call error ("too many unique lines; symbol table overflow"p) i = Next_inx Next_inx += 1 h = hash (line) Sym_store (Next_sym) = Bucket (h) # link in new entry Sym_store (Next_sym + 1) = i Bucket (h) = Next_sym Next_sym += 2 call wind (Text_file) Text_loc (i) = markf (Text_file) call putlin (line, Text_file) return (i) end # gen_listing --- generate a full listing of changes to a file subroutine gen_listing include "diff_com.r.i" sym_pointer oi, ni integer junk integer getlin character line (MAXLINE) procedure collect_unchanged forward procedure collect_deletions (ch) forward procedure collect_insertions (ch) forward oi = 2 ni = 2 repeat select when (Old_count (oi) == 0 && New_count (ni) == 0) collect_unchanged when (Old_count (oi) ~= 1 && New_count (ni) == 1) collect_insertions (' 'c) when (Old_count (oi) == 1 && New_count (ni) ~= 1) collect_deletions (' 'c) when (Old_count (oi) == 1 && New_count (ni) == 1) { collect_deletions ('c'c) collect_insertions ('c'c) } when (Old_count (oi) == 2 && New_count (ni) == 2) break return procedure collect_unchanged { if (Verbose == YES) call putch (NEWLINE, STDOUT) for (; Old_count (oi) == 0 && New_count (ni) == 0; {oi += 1; ni += 1}) { junk = getlin (line, Old_copy) if (Verbose == YES) { call print (STDOUT, " *4i *4i|"s, oi - 1, ni - 1) call putlin (line, STDOUT) } junk = getlin (line, New_copy) } } procedure collect_deletions (ch) { character ch call putch (NEWLINE, STDOUT) for (; Old_count (oi) == 1; oi += 1) { junk = getlin (line, Old_copy) if (Verbose == YES) call print (STDOUT, "*cd *4i *4x|"s, ch, oi - 1) else call print (STDOUT, "*cd *4io|"s, ch, oi - 1) call putlin (line, STDOUT) } } procedure collect_insertions (ch) { character ch call putch (NEWLINE, STDOUT) for (; New_count (ni) == 1; ni += 1) { junk = getlin (line, New_copy) if (Verbose == YES) call print (STDOUT, "*ci *4x *4i|"s, ch, ni - 1) else call print (STDOUT, "*ci *4in|"s, ch, ni - 1) call putlin (line, STDOUT) } } end # gen_revision --- generate 'fmt' input text with revision bar requests subroutine gen_revision include "diff_com.r.i" sym_pointer oi, ni integer junk integer getlin character line (MAXLINE) procedure collect_deletions forward procedure collect_insertions forward oi = 2 ni = 2 repeat select when (Old_count (oi) == 0 && New_count (ni) == 0) { oi += 1 ni += 1 junk = getlin (line, New_copy) call putlin (line, STDOUT) } when (Old_count (oi) ~= 1 && New_count (ni) == 1) { call print (STDOUT, "[cc]mc |*n"s) collect_insertions call print (STDOUT, "[cc]mc*n"s) } when (Old_count (oi) == 1 && New_count (ni) ~= 1) { call print (STDOUT, "[cc]mc ***n"s) call print (STDOUT, "[cc]mc*n"s) collect_deletions } when (Old_count (oi) == 1 && New_count (ni) == 1) { collect_deletions call print (STDOUT, "[cc]mc |*n"s) collect_insertions call print (STDOUT, "[cc]mc*n"s) } when (Old_count (oi) == 2 && New_count (ni) == 2) break return procedure collect_deletions { for (; Old_count (oi) == 1; oi += 1) ; } procedure collect_insertions { for (; New_count (ni) == 1; ni += 1) { junk = getlin (line, New_copy) call putlin (line, STDOUT) } } end # gen_script --- produce editor script to convert old file into new subroutine gen_script include "diff_com.r.i" sym_pointer oi, ni, offset, length integer junk integer getlin character line (MAXLINE) procedure collect_deletions forward procedure collect_insertions forward oi = 2 ni = 2 offset = 0 repeat select when (Old_count (oi) == 0 && New_count (ni) == 0) { oi += 1 ni += 1 junk = getlin (line, New_copy) } when (Old_count (oi) ~= 1 && New_count (ni) == 1) { call print (STDOUT, "*ia*n"s, oi - 2 + offset) collect_insertions call print (STDOUT, ".*n"s) offset += length } when (Old_count (oi) == 1 && New_count (ni) ~= 1) { collect_deletions call print (STDOUT, "*i,*id*n"s, oi - 1 - length + offset, oi - 2 + offset) offset -= length } when (Old_count (oi) == 1 && New_count (ni) == 1) { collect_deletions call print (STDOUT, "*i,*ic*n"s, oi - 1 - length + offset, oi - 2 + offset) offset -= length collect_insertions call print (STDOUT, ".*n"s) offset += length } when (Old_count (oi) == 2 && New_count (ni) == 2) break call print (STDOUT, "w*n"s) return procedure collect_deletions { length = 0 for (; Old_count (oi) == 1; oi += 1) length += 1 } procedure collect_insertions { length = 0 for (; New_count (ni) == 1; ni += 1) { length += 1 junk = getlin (line, New_copy) call putlin (line, STDOUT) } } end # grow --- grow unchanged blocks around unique line pairs subroutine grow include "diff_com.r.i" sym_pointer i, nx for (i = 1; i < New_size; i += 1) { nx = New_xref (i) if (nx > 0) # is this line paired with an old line? if (New_xref (i + 1) < 0 && New_xref (i + 1) == Old_xref (nx + 1)) { Old_xref (nx + 1) = i + 1 New_xref (i + 1) = nx + 1 } } for (i = New_size; i > 1; i -= 1) { nx = New_xref (i) if (nx > 0) # is this line paired? if (New_xref (i - 1) < 0 && New_xref (i - 1) == Old_xref (nx - 1)) { Old_xref (nx - 1) = i - 1 New_xref (i - 1) = nx - 1 } } return end # hash --- hash a line into a hash_index hash_index function hash (line) character line (ARB) integer i hash = 0 for (i = 1; line (i) ~= EOS; i += 1) hash += line (i) hash = mod (iabs (hash), HASH_TABLE_SIZE) + 1 return end # initialize --- set up everything needed for a file comparison subroutine initialize include "diff_com.r.i" file_des mktemp, open integer argno, i integer getarg character arg (MAXPATH) Option = DIFFERENCES # the defaults Verbose = NO argno = 1 # where we expect to find file names if (getarg (1, arg, MAXPATH) ~= EOF && arg (1) == '-'c) { call mapstr (arg, LOWER) for (i = 2; arg (i) ~= EOS; i += 1) select (arg (i)) when ('b'c) Option = BINARY_COMPARISON when ('c'c) Option = COMPARISON when ('d'c) Option = DIFFERENCES when ('r'c) Option = REVISION when ('s'c) Option = SCRIPT when ('v'c) Verbose = YES else call usage argno = 2 } if (getarg (argno, arg, MAXPATH) == EOF) { # no files, use STDIN Old_file = STDIN1 New_file = STDIN2 } else { Old_file = open (arg, READ) if (Old_file == ERR) call cant (arg) argno += 1 if (getarg (argno, arg, MAXPATH) == EOF) New_file = STDIN1 else { New_file = open (arg, READ) if (New_file == ERR) call cant (arg) argno += 1 } } if (getarg (argno, arg, MAXPATH) ~= EOF) call usage Next_inx = 1 Next_sym = 1 Text_file = mktemp (READWRITE) if (Text_file == ERR) call error ("can't open temporary file"p) Old_copy = mktemp (READWRITE) if (Old_copy == ERR) call error ("can't open temporary file"p) New_copy = mktemp (READWRITE) if (New_copy == ERR) call error ("can't open temporary file"p) return end # label --- label lines as "inserted," "deleted," or "unchanged" subroutine label include "diff_com.r.i" sym_pointer oi, ni, ox, nx DEBUG call print (STDOUT, "input new xref: "s) DEBUG do ni = 1, New_size DEBUG call print (STDOUT, " *i"s, New_xref (ni)) DEBUG call print (STDOUT, "*ninput old xref: "s) DEBUG do oi = 1, Old_size DEBUG call print (STDOUT, " *i"s, Old_xref (oi)) DEBUG call putch (NEWLINE, STDOUT) oi = 2 ni = 2 repeat { ox = Old_xref (oi) nx = New_xref (ni) select when (oi >= Old_size && ni >= New_size) break when (oi < Old_size && ox < 0) { # deletion from old file Old_count (oi) = 1 oi += 1 } when (ni < New_size && nx < 0) { # insertion in new file New_count (ni) = 1 ni += 1 } when (ox == ni && nx == oi) { # unchanged line Old_count (oi) = 0 oi += 1 New_count (ni) = 0 ni += 1 } when (oi <= Old_size && ni <= New_size) { # out-of-order block New_count (ni) = 1 ni += 1 Old_count (nx) = 1 Old_count (oi) = 1 oi += 1 New_count (ox) = 1 } else { call print (ERROUT, "Old_xref (*i) = *i, New_xref (*i) = *i*n"s, oi, ox, ni, nx) call error ("in label: can't happen"s) } } Old_count (1) = 2 # mark the null lines specially, Old_count (Old_size) = 2 # so people won't have to deal New_count (1) = 2 # with file sizes New_count (New_size) = 2 return end # load --- load symbol table, set up cross-reference structures subroutine load include "diff_com.r.i" sym_pointer lno, i sym_pointer enter hash_index h character line (MAXLINE) integer getlin TUNING sym_pointer p TUNING integer used, chain_len, max_chain_len, min_chain_len do h = 1, HASH_TABLE_SIZE Bucket (h) = NULL_POINTER do lno = 1, MAX_UNIQUE_LINES; { Old_count (lno) = 0 New_count (lno) = 0 } # Load the "old" file: for (lno = 2; getlin (line, Old_file) ~= EOF; lno += 1) { if (lno > MAX_FILE_SIZE) call error ("old file too large to handle"p) call putlin (line, Old_copy) i = enter (line) Old_count (i) += 1 Old_lno (i) = lno Old_xref (lno) = -i } Old_size = lno # includes null line at end # Load the "new" file: for (lno = 2; getlin (line, New_file) ~= EOF; lno += 1) { if (lno > MAX_FILE_SIZE) call error ("new file too large to handle"p) call putlin (line, New_copy) i = enter (line) New_count (i) += 1 New_xref (lno) = -i } New_size = lno # also allows for null line at end TUNING call print (STDOUT2, "Old_size = *i, New_size = *i*n"s, TUNING Old_size, New_size) TUNING call print (STDOUT2, "*i unique lines*n"s, Next_inx - 1) TUNING used = 0 TUNING max_chain_len = 0 TUNING min_chain_len = MAX_UNIQUE_LINES TUNING do h = 1, HASH_TABLE_SIZE; { TUNING p = Bucket (h) TUNING if (p ~= NULL_POINTER) TUNING used += 1 TUNING chain_len = 0 TUNING while (p ~= NULL_POINTER) { TUNING chain_len += 1 TUNING p = Sym_store (p) TUNING } TUNING max_chain_len = max0 (chain_len, max_chain_len) TUNING min_chain_len = min0 (chain_len, min_chain_len) TUNING } TUNING call print (STDOUT2, "chain lengths: min = *i, avg = *i, max = *i*n"s, TUNING min_chain_len, (Next_inx - 1) / used, max_chain_len) TUNING call print (STDOUT2, "hash buckets *i% full*n"s, TUNING (100*used)/HASH_TABLE_SIZE) return end # pair --- pair up unique lines in both files subroutine pair include "diff_com.r.i" sym_pointer i, j for (i = 2; i < New_size; i += 1) { j = -New_xref (i) if (Old_count (j) == 1 && New_count (j) == 1) { # unique pair New_xref (i) = Old_lno (j) Old_xref (Old_lno (j)) = i } } New_xref (1) = 1 # match null lines at BOF Old_xref (1) = 1 New_xref (New_size) = Old_size # ... and at EOF Old_xref (Old_size) = New_size return end # report --- report differences between files in desired format subroutine report include "diff_com.r.i" DEBUG sym_pointer i DEBUG call print (ERROUT, "New mark: "s) DEBUG do i = 1, New_size DEBUG call print (ERROUT, " *i"s, New_count (i)) DEBUG call print (ERROUT, "*nOld mark: "s) DEBUG do i = 1, Old_size DEBUG call print (ERROUT, " *i"s, Old_count (i)) DEBUG call putch (NEWLINE, ERROUT) call rewind (Old_copy) call rewind (New_copy) select (Option) when (DIFFERENCES) call gen_listing when (REVISION) call gen_revision when (SCRIPT) call gen_script else call error ("in report: can't happen"p) return end # simple_compare --- do a line-by-line comparison of the input files subroutine simple_compare include "diff_com.r.i" character line1 (MAXLINE), line2 (MAXLINE) integer lineno, m1, m2 integer equal, getlin lineno = 0 repeat { m1 = getlin (line1, Old_file) m2 = getlin (line2, New_file) if (m1 == EOF || m2 == EOF) break lineno += 1 if (equal (line1, line2) == NO) if (Verbose == YES) call print (STDOUT, "*5i*n*8x*s*8x*s"s, lineno, line1, line2) else { call print (STDOUT, "different*n"s) return } } if (m1 == m2) return if (Verbose == NO) call print (STDOUT, "different*n"s) elif (m1 == EOF) call print (STDOUT, "eof on old file*n"s) elif (m2 == EOF) call print (STDOUT, "eof on new file*n"s) return end # binary_compare --- compare files in binary subroutine binary_compare include "diff_com.r.i" integer m1, m2, i, len, buf1 (MAXBUF), buf2 (MAXBUF) integer readf longint block block = -1 # minus 1 to compensate for 1-based indexing repeat { m1 = readf (buf1, MAXBUF, Old_file) m2 = readf (buf2, MAXBUF, New_file) if (m1 ~= m2 || m1 == EOF) break do i = 1, m1 if (buf1 (i) ~= buf2 (i)) { if (Verbose == YES) call print (STDOUT, "*10l: *6,-8,0i *6,-8,0i*n"s, block + i, buf1 (i), buf2 (i)) else { call print (STDOUT, "different*n"s) return } } block += m1 } if (m1 == m2) # files are equal return if (Verbose == NO) { # unequal but we don't care how call print (STDOUT, "different*n"s) return } if (m1 ~= EOF && m2 ~= EOF) { # some words left to compare if (m1 < m2) { # old file is shorter than new one len = m1 m1 = EOF } else { # new file is shorter than old one len = m2 m2 = EOF } do i = 1, len # compare remaining words if (buf1 (i) ~= buf2 (i)) call print (STDOUT, "*10l: *6,-8,0i *6,-8,0i*n"s, block + i, buf1 (i), buf2 (i)) } if (m1 == EOF) call print (STDOUT, "eof on old file*n"s) elif (m2 == EOF) call print (STDOUT, "eof on new file*n"s) return end # usage --- print usage message, then die subroutine usage call error ("Usage: diff [-{b|c|d|r|s|v}] [old_file [new_file]]"p) end