;bin: sortandmergepostings: Overhaul for more robust determinism

* Avoids non-deterministic flip-flopping when the alphabetical account sort has multiple commodities
* Sorts postings commodities so commodities are in the same order across transactions
* Sorts postings with matching commodity by posting amount
This commit is contained in:
Caleb Maclennan 2025-11-17 12:32:57 +03:00 committed by Simon Michael
parent 9031612c30
commit 62092749ef
2 changed files with 129 additions and 33 deletions

View File

@ -233,12 +233,14 @@ $ watchaccounts -f time.journal client1 date:thismonth -l
### sortandmergepostings
[`sortandmergepostings`](https://github.com/simonmichael/hledger/blob/master/bin/sortandmergepostings)
is an adventuresome awk script intended to clean up and merge similar postings in a transaction
is an adventuresome AWK script intended to clean up and merge similar postings in a transaction
(see [original discussion](https://unix.stackexchange.com/questions/526995/re-order-lines-and-merge-others-based-on-a-specific-criteria/527004)).
It sorts postings so that positive ones are first, negative ones last.
Within each sign, postings are sorted alphabetically by account name.
Lastly if there are multiple postings to the same account in the same direction, it tries to merge them (by leaving some amounts blank).
Piping the output to `hledger print` can recalculate the missing amounts.
Within each sign, postings are sorted by commodity.
Within each commodity group, postings are sorted by amount.
Among identical amounts in the same group, postings are sorted alphabetically by account name.
Once sorted, if there are multiple postings to the same account in the same direction with the same commodity and comments, it tries to merge them (by leaving some amounts blank).
Subsequently piping the output to `hledger print` can recalculate the missing amounts.
Multiple runs might be needed to clean up all duplicates.
```cli
$ sortandmergepostings input.journal | hledger -f - print -x

View File

@ -3,9 +3,11 @@
#
# Passed a ledger file, this will:
# 1. Sort accretion postings before deductions
# 2. Sort postings by account alphabetically
# 3. Merge 1 set of postings with the same account and direction by clearing
# the amount field. Note all posting meta data must also match to merge.
# 2. Sort posting groups by commodity (descending for accretions, ascending for deductions)
# 3. Sort commodity groups by amount (descending)
# 4. Sort commodity groups by account name
# 5. Merge 1 set of postings with the same account, commodity, direction, and other
# meta data by clearing the commodity and amounts and reducing to a single posting.
#
# Suggested usage:
# $ sortandmergepostings journal.ledger | hledger -f - print -x
@ -13,37 +15,120 @@
# Given that each run will only merge and recalculate amounts on one account per
# transaction it may need to be run multiple times to fully normalize a ledger.
BEGIN { FS = "[[:space:]][[:space:]]+" }
# One-time setup of the constants shared by the rest of the script.
BEGIN {
    # Separator placed between the components of a sort key.
    KS = "___"
    # Regex fragment capturing one ISO-style date (YYYY-MM-DD).
    DATE = "([0-9]{4}-[0-9]{2}-[0-9]{2})"
    # Fields are delimited by runs of two or more whitespace characters,
    # so single spaces inside a field do not split it.
    FS = "[[:space:]][[:space:]]+"
}
# Return the account field with a leading status mark removed.
#   val: the raw account field of a posting line
function extract_account(val) {
    # A status mark is "*" or "!" followed by one space: two characters.
    if (val ~ /^[*!] /)
        val = substr(val, 3)
    # NOTE(review): as written this pattern only matches the literal text
    # "()[]" at the start of the field, so delimiters wrapped around an
    # account name are left in place -- confirm that is the intent.
    sub(/^\(\)\[\]/, "", val)
    return val
}
# Return the commodity symbol of an amount field.
#   val:  the raw amount field of a posting line
#   segs: function-local scratch array (callers do not pass it)
# Only the text before the first run of "@" / "=" characters is considered;
# from that text every minus sign, digit, ".", "," and space is removed and
# whatever is left is returned.
function extract_commodity(val,    segs) {
    # Declaring segs as an extra parameter keeps it local, so this helper
    # no longer overwrites a global array of the same name.
    split(val, segs, / *[@=]+ */)
    gsub(/[-[:digit:]., ]+/, "", segs[1])
    return segs[1]
}
# Return the quantity of an amount field as text with "." as decimal mark.
#   val:  the raw amount field of a posting line
#   segs, v: function-local scratch variables (callers do not pass them)
# Only the text before the first run of "@" / "=" characters is considered.
function extract_amount(val,    segs, v) {
    # segs and v are declared as extra parameters so they stay local and do
    # not overwrite globals of the same name.
    split(val, segs, / *[@=]+ */)
    # Keep only minus signs, digits, "." and ",".
    gsub(/[^-[:digit:].,]+/, "", segs[1])
    # Treat "." and "," alike: turn every one into a placeholder ...
    gsub(/[.,]/, "_", segs[1])
    # ... make the last placeholder (when something follows it) the decimal
    # point ...
    v = gensub(/_([^_]+)$/, ".\\1", "1", segs[1])
    # ... and drop the remaining placeholders (digit-group separators).
    gsub(/_/, "", v)
    return v
}
# Return the minus signs found in an amount field ("" when there are none).
#   val:  the raw amount field of a posting line
#   segs: function-local scratch array (callers do not pass it)
# Only the text before the first run of "=" characters is considered, so a
# sign appearing after "=" does not count.
function extract_direction(val,    segs) {
    # segs is declared as an extra parameter so it stays local and does not
    # overwrite a global array of the same name.
    split(val, segs, / *=+ */)
    gsub(/[^-]/, "", segs[1])
    return segs[1]
}
# Return the date that applies to a posting.
#   val:   text to search for a "date:" / "date2:" tag (the posting comment)
#   found: function-local scratch array (callers do not pass it)
# Returns the tagged date when val carries one, otherwise the global
# transaction_date. Relies on the global DATE regex fragment.
function extract_date(val,    found) {
    # Search the argument itself. The previous form, match($val, ...),
    # used val as a field number, which for non-numeric text means $0, so
    # the whole input record was searched instead of val.
    if (match(val, "date2?: *" DATE, found))
        return found[1]
    return transaction_date
}
# Build the key under which postings that may be merged are counted.
#   account, commodity, comment: the three components that must all match
# Returns an opaque string meant only for use as an array subscript.
function make_posting_key(account, commodity, comment) {
    # Join with SUBSEP so component boundaries are kept: plain
    # concatenation makes ("ab", "c") and ("a", "bc") the same key.
    # Returning the expression directly also avoids writing to a global.
    return account SUBSEP commodity SUBSEP comment
}
# Build the array subscript under which a posting is stored for sorting.
# The components are joined with the global separator KS in a fixed order:
#   1 date, 2 account, 3 commodity, 4 firstamount, 5 postingct
# sort_keys() below depends on exactly these positions.
function make_sort_key(date, account, firstamount, commodity, postingct) {
    # Spelled out explicitly: building this with "for (i in key)" would make
    # the layout depend on awk's unspecified array traversal order.
    return date KS account KS commodity KS firstamount KS postingct
}

# Comparison function for asorti(): orders two keys made by make_sort_key().
#   i1, i2: the two subscripts being compared
#   v1, v2: the corresponding values (unused)
#   l, r:   unused; kept so the parameter list stays compatible
#   a, b, n, i: function-local scratch variables
# Returns a negative number, 0 or a positive number when i1 sorts before,
# equal to, or after i2.
# Components are compared left to right and the first difference decides:
#   - commodity (3): ascending when the first key's amount is > 0,
#                    descending otherwise
#   - amount (4):    descending
#   - all others:    ascending
# NOTE(review): priority follows component position (date, account,
# commodity, amount, counter) -- confirm this is the intended precedence.
function sort_keys(i1, v1, i2, v2, l, r,    a, b, n, i) {
    n = split(i1, a, KS)
    split(i2, b, KS)
    # Counted loop: the traversal order of "for (i in a)" is unspecified,
    # which would make the comparison priority unpredictable.
    for (i = 1; i <= n; i++) {
        if (a[i] == b[i])
            continue
        if (i == 3) {
            if (a[4] > 0)
                return a[i] > b[i] ? 1 : -1
            return a[i] < b[i] ? 1 : -1
        }
        if (i == 4)
            return a[i] < b[i] ? 1 : -1
        return a[i] > b[i] ? 1 : -1
    }
    return 0
}
# Emit the postings collected for the current transaction, then reset the
# per-transaction state. Takes no arguments; works entirely on globals
# (accretions, deductions, seen, inferred_posting, ...).
# NOTE(review): this listing appears to interleave pre- and post-change
# statements (for example both forms of the asorti calls are present);
# the comments below describe each statement as written.
function dump() {
# Sort the subscripts of both groups; "as"/"ds" receive the ordered
# subscripts and an/dn the element counts.
an = asorti(accretions, as)
dn = asorti(deductions, ds)
# Same, but ordered by the user-defined comparison function sort_keys.
an = asorti(accretions, as, "sort_keys")
dn = asorti(deductions, ds, "sort_keys")
# Queue accretions first, then deductions, in sorted order.
for (i=1; i<=an; i++) {
postings[length(postings)+1] = accretions[as[i]]
}
for (i=1; i<=dn; i++) {
postings[length(postings)+1] = deductions[ds[i]]
}
# A posting with a blank amount was already recorded for this transaction:
# drop the counts so that no key reaches the threshold of 2 tested below.
if (inferred_posting) delete seen
# NOTE(review): "for (i in ...)" traversal order is unspecified in awk --
# confirm the loops below visit and print postings in the sorted order.
for (i in postings) {
posting = postings[i]
# Re-split the stored line with the same field separator as the input.
split(posting, parts, FS)
currency = parts[3]
gsub(/[[:digit:]., ]+/, "", currency)
if (!inferred && (!merge || merge == parts[2]) && seen[parts[2] currency parts[4]]>1 && parts[3] !~ /@/) {
if (!merge) merged[i] = " " parts[2] " " parts[4]
merge = parts[2]
account = extract_account(parts[2])
commodity = extract_commodity(parts[3])
comment = parts[4]
posting_key = make_posting_key(account, commodity, comment)
# The line printed in place of a merged group: account and comment only,
# with no amount.
as_inferred = " " account " " comment
# Keep the posting as-is when its key was counted fewer than twice, or
# when a different commodity has already been chosen for merging.
if (seen[posting_key] < 2 || (inferred_commodity && inferred_commodity != commodity)) {
explicit_postings[i] = posting
} else {
merged[i] = posting
# Only one account/comment combination is merged; the first candidate
# wins and later candidates that differ stay explicit.
if (!inferred_posting || as_inferred == inferred_posting) {
inferred_posting = as_inferred
inferred_commodity = commodity
} else {
explicit_postings[i] = posting
}
}
}
for (i in merged) print merged[i]
if (inferred) print inferred
inferred = ""
merge = ""
# Print the postings that keep their amounts, then the single amount-less
# posting, if any.
for (i in explicit_postings) print explicit_postings[i]
if (inferred_posting) print inferred_posting
# Reset state for the next transaction.
# NOTE(review): inferred_commodity is assigned above but is not among the
# variables reset here -- confirm it should carry over between
# transactions.
inferred_posting = ""
merged_key = ""
delete accretions
delete deductions
delete postings
delete merged
delete explicit_postings
delete seen
}
@ -59,38 +144,47 @@ END {
# NOTE(review): this listing appears to interleave pre- and post-change
# statements (for example both "account = $2" and
# "account = extract_account($2)"); comments describe each rule as written.

# A line starting in column one: flush the postings collected so far,
# remember the date at the start of the line when there is one, and echo
# the line unchanged.
/^[^[:space:]]/ {
dump()
if (match($0, "^" DATE, tday)) {
transaction_date = tday[1]
}
print $0
next
}
# Every other line: split out its parts for the rules that follow.
{
# Running counter of such lines; it is the last component of the sort key.
postingct++
account = $2
posting = $0
account = extract_account($2)
commodity = extract_commodity($3)
amount = $3
comments = $4
currency = amount
gsub(/[[:digit:]., ]+/, "", currency)
sub(/^[*!] /, "", account)
firstamount = extract_amount($3)
direction = extract_direction($3)
comment = $4
date = extract_date($4)
sort_key = make_sort_key(date, account, firstamount, commodity, postingct)
}
# Immediately output transaction comments at the top
account ~ /^;/ {
print
next
}
# If amount is blank, this is our one and only allowed inferred amount
!amount {
inferred = $0
inferred_posting = posting
next
}
amount !~ /@/ {
seen[account currency comments]++
# If no rates or balance assertions, this is eligible for merging
amount !~ /[@=]/ {
seen[make_posting_key(account, commodity, comment)]++
}
# Postings without a minus sign are stored as accretions, those with one
# as deductions.
amount !~ /-/ {
accretions[account postingct] = $0
direction !~ /-/ {
accretions[sort_key] = posting
}
amount ~ /-/ {
deductions[account postingct] = $0
direction ~ /-/ {
deductions[sort_key] = posting
}