From c70670bacb092041f33e1dda17cce584b8ce1fc6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 24 Aug 2023 11:41:07 +0200 Subject: [PATCH] (common) New UrlIdCodec class Have a single class responsible for encoding and decoding URL ids, as it's a bit finicky and used all over. --- .../nu/marginalia/model/id/UrlIdCodec.java | 94 +++++++++++++++++++ .../marginalia/model/id/UrlIdCodecTest.java | 37 ++++++++ 2 files changed, 131 insertions(+) create mode 100644 code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java create mode 100644 code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java new file mode 100644 index 00000000..86c8deac --- /dev/null +++ b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java @@ -0,0 +1,94 @@ +package nu.marginalia.model.id; + +/** URL id encoding scheme, including an optional ranking part that's used in the indices and washed away + * outside. The ranking part is put in the highest bits so that when we sort the documents by id, they're + * actually sorted by rank. Next is the domain id part, which keeps documents from the same domain clustered. + * Finally is the document ordinal part, which is a non-unique sequence number for within the current set of + * documents loaded. The same ID may be re-used over time as a new index is loaded. + *

+ * + * + * + * + * + * + * + * + * + * + * + *
Part     | Bits    | Cardinality
rank     | 6 bits  | 64
domain   | 31 bits | 2 billion
document | 26 bits | 67 million
+ *

+ * Most significant bit is unused for now because I'm not routing Long.compareUnsigned() all over the codebase. + * If we end up needing more domains, we'll cross that bridge when we come to it. + * + *

Coding Scheme

+ *
+ * [    | rank | domain | url ]
+ *  0   1       7       38    64
+ * 
/**
 * Encodes and decodes 64-bit URL ids.
 *
 * <p>Bit layout, counted from the least significant bit:
 * <ul>
 *   <li>bits 0-25: document ordinal (26 bits)</li>
 *   <li>bits 26-56: domain id (31 bits)</li>
 *   <li>bits 57-62: optional rank (6 bits, values 0-63)</li>
 *   <li>bit 63: unused, so that ids stay non-negative and signed comparison works</li>
 * </ul>
 *
 * <p>The rank occupies the highest used bits, so sorting ids numerically sorts them by rank first,
 * then by domain id, then by document ordinal.
 */
public class UrlIdCodec {
    /** Number of low bits holding the document ordinal; the domain id starts right above them. */
    private static final int DOMAIN_SHIFT = 26;

    /** Position of the lowest rank bit. */
    private static final int RANK_SHIFT = 57;

    /** Largest rank value that fits in the 6 rank bits. */
    private static final long MAX_RANK = 63;

    /**
     * Every bit from {@link #RANK_SHIFT} upwards: the six rank bits plus the unused top bit.
     * This is exactly the set of bits {@link #getRank(long)} reads, so clearing it guarantees
     * that the rank of the result is zero.
     */
    private static final long RANK_MASK = 0xFE00_0000_0000_0000L;

    private static final int DOMAIN_MASK = 0x7FFF_FFFF;
    private static final int DOCORD_MASK = 0x03FF_FFFF;

    /**
     * Encode a URL id without a ranking element.
     *
     * @param domainId        domain id; only the low 31 bits are kept
     * @param documentOrdinal document ordinal; only the low 26 bits are kept
     * @return the combined id with a rank of zero
     */
    public static long encodeId(int domainId, int documentOrdinal) {
        long domainPart = (long) (domainId & DOMAIN_MASK) << DOMAIN_SHIFT;
        long documentPart = documentOrdinal & DOCORD_MASK;

        return domainPart | documentPart;
    }

    /**
     * Encode a URL id with the optional ranking part.
     *
     * @param rank            [0,1] the importance of the domain, low is good; values outside
     *                        the range are clamped
     * @param domainId        domain id; only the low 31 bits are kept
     * @param documentOrdinal document ordinal; only the low 26 bits are kept
     * @return the combined id including the quantized rank
     */
    public static long encodeIdWithRank(float rank, int domainId, int documentOrdinal) {
        return encodeId(domainId, documentOrdinal) | (quantizeRank(rank) << RANK_SHIFT);
    }

    /**
     * Add a ranking element to an existing combined URL id, replacing any rank already present.
     *
     * @param rank  [0,1] the importance of the domain, low is good; values outside the range
     *              are clamped
     * @param urlId a combined URL id
     * @return the id with its rank bits set from {@code rank}
     */
    public static long addRank(float rank, long urlId) {
        return removeRank(urlId) | (quantizeRank(rank) << RANK_SHIFT);
    }

    /** Extract the domain component from this URL id. */
    public static int getDomainId(long combinedId) {
        return (int) ((combinedId >>> DOMAIN_SHIFT) & DOMAIN_MASK);
    }

    /** Extract the document ordinal component from this URL id. */
    public static int getDocumentOrdinal(long combinedId) {
        return (int) (combinedId & DOCORD_MASK);
    }

    /** Extract the quantized rank component from this URL id. */
    public static int getRank(long combinedId) {
        return (int) (combinedId >>> RANK_SHIFT);
    }

    /** Mask out the ranking element from this URL id. */
    public static long removeRank(long combinedId) {
        return combinedId & ~RANK_MASK;
    }

    /** Scale a rank in [0,1] to an integer in [0,63], clamping anything outside that range. */
    private static long quantizeRank(float rank) {
        // The float-to-int cast saturates on overflow and maps NaN to 0, so the
        // clamp below is sufficient for every possible input.
        long quantized = (int) (rank * (1 << 6));

        if (quantized > MAX_RANK) {
            return MAX_RANK;
        }
        if (quantized < 0) {
            return 0;
        }
        return quantized;
    }
}
b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java new file mode 100644 index 00000000..10fda63b --- /dev/null +++ b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java @@ -0,0 +1,37 @@ +package nu.marginalia.model.id; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class UrlIdCodecTest { + @Test + public void testDocumentBounds() { + long encoded = UrlIdCodec.encodeId(0, ~0); + assertEquals(0, UrlIdCodec.getDomainId(encoded)); + } + + @Test + public void testDomainBounds() { + long encoded = UrlIdCodec.encodeId(~0, 0); + assertEquals(0x7FFF_FFFF, UrlIdCodec.getDomainId(encoded)); + assertEquals(0, UrlIdCodec.getRank(encoded)); + assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded)); + } + + @Test + public void testRankBounds() { + long encoded = UrlIdCodec.encodeIdWithRank(1.0f, 0, 0); + assertEquals(0, UrlIdCodec.getDomainId(encoded)); + assertEquals(63, UrlIdCodec.getRank(encoded)); + assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded)); + } + + @Test + public void testRankBoundsNeg() { + long encoded = UrlIdCodec.encodeIdWithRank(-1.0f, 0, 0); + assertEquals(0, UrlIdCodec.getDomainId(encoded)); + assertEquals(0, UrlIdCodec.getRank(encoded)); + assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded)); + } +} \ No newline at end of file