(common) New UrlIdCodec class
Have a single class responsible for encoding and decoding URL ids, as it's a bit finicky and used all over.
This commit is contained in:
parent
7bb3e44a76
commit
c70670bacb
@ -0,0 +1,94 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
/** URL id encoding scheme, including an optional ranking part that's used in the indices and washed away
|
||||
* outside. The ranking part is put in the highest bits so that when we sort the documents by id, they're
|
||||
* actually sorted by rank. Next is the domain id part, which keeps documents from the same domain clustered.
|
||||
* Finally is the document ordinal part, which is a non-unique sequence number for within the current set of
|
||||
* documents loaded. The same ID may be re-used over time as a new index is loaded.
|
||||
* <p></p>
|
||||
* <table>
|
||||
* <tr><th>Part</th><th>Bits</th><th>Cardinality</th></tr>
|
||||
* <tr>
|
||||
* <td>rank</td><td>6 bits</td><td>64</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>domain</td><td>31 bits</td><td>2 billion</td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td>document</td><td>26 bits</td><td>67 million</td>
|
||||
* </tr>
|
||||
* </table>
|
||||
* <p></p>
|
||||
* Most significant bit is unused for now because I'm not routing Long.compareUnsigned() all over the codebase.
|
||||
* <i>If</i> we end up needing more domains, we'll cross that bridge when we come to it.
|
||||
*
|
||||
* <h2>Coding Scheme</h2>
|
||||
* <code><pre>
|
||||
* [ | rank | domain | url ]
|
||||
* 0 1 6 38 64
|
||||
* </pre></code>
|
||||
*/
|
||||
public class UrlIdCodec {
|
||||
private static final long RANK_MASK = 0x8600_0000_0000_0000L;
|
||||
private static final int DOCORD_MASK = 0x03FF_FFFF;
|
||||
|
||||
/** Encode a URL id without a ranking element */
|
||||
public static long encodeId(int domainId, int documentOrdinal) {
|
||||
domainId &= 0x7FFF_FFFFL;
|
||||
documentOrdinal &= 0x03FF_FFFF;
|
||||
|
||||
return ((long) domainId << 26) | documentOrdinal;
|
||||
}
|
||||
|
||||
/** Encode a URL id with the optional ranking part
|
||||
*
|
||||
* @param rank [0,1] the importance of the domain, low is good
|
||||
* @param domainId
|
||||
* @param documentOrdinal
|
||||
* @return
|
||||
*/
|
||||
public static long encodeIdWithRank(float rank, int domainId, int documentOrdinal) {
|
||||
long rankPart = (int)(rank * (1<<6));
|
||||
|
||||
if (rankPart >= 64) rankPart = 63;
|
||||
if (rankPart < 0) rankPart = 0;
|
||||
|
||||
return encodeId(domainId, documentOrdinal) | (rankPart << 57);
|
||||
}
|
||||
|
||||
/** Add a ranking element to an existing combined URL id.
|
||||
*
|
||||
* @param rank [0,1] the importance of the domain, low is good
|
||||
* @param urlId
|
||||
*/
|
||||
public static long addRank(float rank, long urlId) {
|
||||
long rankPart = (int)(rank * (1<<6));
|
||||
|
||||
if (rankPart >= 64) rankPart = 63;
|
||||
if (rankPart < 0) rankPart = 0;
|
||||
|
||||
return (urlId&(~RANK_MASK)) | (rankPart << 57);
|
||||
}
|
||||
|
||||
/** Extract the domain component from this URL id */
|
||||
public static int getDomainId(long combinedId) {
|
||||
return (int) ((combinedId >>> 26) & 0x7FFF_FFFFL);
|
||||
}
|
||||
|
||||
/** Extract the document ordinal component from this URL id */
|
||||
public static int getDocumentOrdinal(long combinedId) {
|
||||
return (int) (combinedId & DOCORD_MASK);
|
||||
}
|
||||
|
||||
|
||||
/** Extract the document ordinal component from this URL id */
|
||||
public static int getRank(long combinedId) {
|
||||
return (int) (combinedId >>> 57);
|
||||
}
|
||||
|
||||
/** Mask out the ranking element from this URL id */
|
||||
public static long removeRank(long combinedId) {
|
||||
return combinedId & (~RANK_MASK);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
package nu.marginalia.model.id;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class UrlIdCodecTest {
|
||||
@Test
|
||||
public void testDocumentBounds() {
|
||||
long encoded = UrlIdCodec.encodeId(0, ~0);
|
||||
assertEquals(0, UrlIdCodec.getDomainId(encoded));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDomainBounds() {
|
||||
long encoded = UrlIdCodec.encodeId(~0, 0);
|
||||
assertEquals(0x7FFF_FFFF, UrlIdCodec.getDomainId(encoded));
|
||||
assertEquals(0, UrlIdCodec.getRank(encoded));
|
||||
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRankBounds() {
|
||||
long encoded = UrlIdCodec.encodeIdWithRank(1.0f, 0, 0);
|
||||
assertEquals(0, UrlIdCodec.getDomainId(encoded));
|
||||
assertEquals(63, UrlIdCodec.getRank(encoded));
|
||||
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRankBoundsNeg() {
|
||||
long encoded = UrlIdCodec.encodeIdWithRank(-1.0f, 0, 0);
|
||||
assertEquals(0, UrlIdCodec.getDomainId(encoded));
|
||||
assertEquals(0, UrlIdCodec.getRank(encoded));
|
||||
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user