feat(storage/dataflux): add range_splitter #10748 (#10899)

feat(storage/dataflux): add range_splitter #10748 (#10899) · googleapis/google-cloud-go@d49da26

@@ -15,13 +15,16 @@

1515package dataflux16161717import (18+"fmt"19+"math/big"20+"sort"1821"sync"1922)20232124// rangeSplitter specifies the a list and a map of sorted alphabets.2225type rangeSplitter struct {2326mu sync.Mutex24-sortedAlphabet *[]rune27+sortedAlphabet []rune2528alphabetMap map[rune]int2629}2730

@@ -31,12 +34,334 @@ type listRange struct {

3134endRange string3235}333637+// minimalIntRange specifies start and end range in base-10 form, along with the38+// minimal string length for the split range strings.39+type minimalIntRange struct {40+startInteger *big.Int41+endInteger *big.Int42+minimalLength int43+}44+45+// generateSplitsOpts specifies the parameters needed to generate the split46+// range strings.47+type generateSplitsOpts struct {48+minimalIntRange *minimalIntRange49+numSplits int50+startRange string51+endRange string52+}53+3454// newRangeSplitter creates a new RangeSplitter with the given alphabets.35-func newRangeSplitter(alphabet string) *rangeSplitter {36-return &rangeSplitter{}55+// RangeSplitter determines split points within a given range based on the given56+// alphabets.57+func newRangeSplitter(alphabet string) (*rangeSplitter, error) {58+59+// Validate that we do not have empty alphabet passed in.60+if len(alphabet) == 0 {61+return nil, fmt.Errorf("no alphabet specified for the range splitter")62+ }63+// Sort the alphabet lexicographically and store a mapping of each alphabet64+// to its index. We need a mapping for efficient index lookup in later operations.65+sortedAlphabet := []rune(alphabet)66+sortAlphabet(sortedAlphabet)67+alphabetMap := constructAlphabetMap(sortedAlphabet)68+69+return &rangeSplitter{70+alphabetMap: alphabetMap,71+sortedAlphabet: sortedAlphabet,72+ }, nil3773}387439-// splitRange creates a given number of splits based on a provided start and end range.75+// splitRange divides the provided start and end range into approximately equal76+// subranges, returning the split points. An empty slice is returned if suitable77+// splits cannot be determined. Please note that this method provides a rough78+// estimate of split points, without ensuring precise even partitioning of the range.79+// Additionally, the number of found splits might be fewer than requested if the80+// algorithm struggles to find sufficient split points. If the start range is empty81+// the algorithm assumes it to be sequence of smallest possible character and empty82+// end range as sequence of highest possible characters.83+//84+// For example, sorted alphabet = {"a","b","c","d"}85+// Input: startRange= "d", endRange= "", numSplits=286+//87+// This will be converted from base-N to base-10 integers.88+// While calculating base-10 integer, "a" will be appended to startRange89+// and "d" will be appended to endRange until the difference between integers is90+// more than number of splits.91+// startInteger for "da" = 12, endInteger for "dd" = 1592+//93+// The splits points will be 13 and 14 in base-10. This will be converted back to94+// base-N value and returned as split points:95+// {"db","dc"}96+4097func (rs *rangeSplitter) splitRange(startRange, endRange string, numSplits int) ([]string, error) {41-return nil, nil98+// Number of splits has to be at least one, otherwise it is not splittable.99+if numSplits < 1 {100+return nil, fmt.Errorf("number of splits should be at least 1, got %d", numSplits)101+ }102+103+// End range (if specified) has to be lexicographically greater than the start range104+// for the range to be valid.105+if len(endRange) != 0 && startRange >= endRange {106+return nil, fmt.Errorf("start range %q cannot be lexicographically greater than end range %q", startRange, endRange)107+ }108+109+rs.addCharsToAlphabet([]rune(startRange))110+rs.addCharsToAlphabet([]rune(endRange))111+112+// Validate start range characters and convert into character array form.113+startRangeCharArray, err := rs.convertRangeStringToArray(startRange)114+if err != nil {115+return nil, fmt.Errorf("unable to convert start range %q to array: %v", startRange, err)116+ }117+118+// Validate end range characters and convert into character array form.119+endRangeCharArray, err := rs.convertRangeStringToArray(endRange)120+if err != nil {121+return nil, fmt.Errorf("unable to convert end range %q to array: %v", endRange, err)122+ }123+124+// Construct the final split ranges to be returned.125+var splitPoints []string126+127+// If the start and end string ranges are equal with padding, no splitting is128+// necessary. In such cases, an empty array of split ranges is returned.129+if rs.isRangeEqualWithPadding(startRangeCharArray, endRangeCharArray) {130+return splitPoints, nil131+ }132+// Convert the range strings from base-N to base-10 and employ a greedy approach133+// to determine the smallest splittable integer range difference.134+minimalIntRange, err := rs.convertStringRangeToMinimalIntRange(135+startRangeCharArray, endRangeCharArray, numSplits)136+if err != nil {137+return nil, fmt.Errorf("range splitting with start range %q and end range %q: %v",138+startRange, endRange, err)139+ }140+141+// Generate the split points and return them.142+splitPoints = rs.generateSplits(generateSplitsOpts{143+startRange: startRange,144+endRange: endRange,145+numSplits: numSplits,146+minimalIntRange: minimalIntRange,147+ })148+149+return splitPoints, nil150+}151+152+// generateSplits generates the split points by translating the start and end153+// range strings into base-10 integers, performing a split within the integer154+// domain, and then converting the splits back into strings. In essence, this155+// operation resembles a base-N to base-10 conversion, followed by a split in156+// base 10, and finally another base-10 to base-N conversion. In this scenario,157+// N represents the size of the alphabet, with the character's position in the158+// alphabet indicating the digit's value.159+func (rs *rangeSplitter) generateSplits(opts generateSplitsOpts) []string {160+161+startInteger := opts.minimalIntRange.startInteger162+endInteger := opts.minimalIntRange.endInteger163+minimalLength := opts.minimalIntRange.minimalLength164+165+rangeDifference := new(big.Int).Sub(endInteger, startInteger)166+167+var splitPoints []string168+169+// The number of intervals is one more than the number of split points.170+rangeInterval := new(big.Int).SetInt64(int64(opts.numSplits + 1))171+172+for i := 1; i <= opts.numSplits; i++ {173+// Combine the range interval and index to determine the split point in base-10 form.174+rangeDiffWithIdx := new(big.Int).Mul(rangeDifference, big.NewInt(int64(i)))175+rangeInterval := new(big.Int).Div(rangeDiffWithIdx, rangeInterval)176+splitPoint := new(big.Int).Add(rangeInterval, startInteger)177+178+// Convert the split point back from base-10 to base-N.179+splitString := rs.convertIntToString(splitPoint, minimalLength)180+181+// Due to the approximate nature on how the minimal int range is derived, we need to perform182+// another validation to check to ensure each split point falls in valid range.183+isGreaterThanStart := len(splitString) > 0 && splitString > opts.startRange184+isLessThanEnd := len(opts.endRange) == 0 || (len(splitString) > 0 && splitString < opts.endRange)185+if isGreaterThanStart && isLessThanEnd {186+splitPoints = append(splitPoints, splitString)187+ }188+ }189+return splitPoints190+}191+192+// sortAlphabet sorts the alphabets string lexicographically.193+func sortAlphabet(unsortedAlphabet []rune) {194+sort.Slice(unsortedAlphabet, func(i, j int) bool {195+return unsortedAlphabet[i] < unsortedAlphabet[j]196+ })197+}198+199+// constructAlphabetMap constructs a mapping from each character in the200+// alphabets to its index in the alphabet array.201+func constructAlphabetMap(alphabet []rune) map[rune]int {202+alphabetMap := make(map[rune]int)203+for i, char := range alphabet {204+alphabetMap[char] = i205+ }206+return alphabetMap207+}208+209+// addCharsToAlphabet adds a character to the known alphabet.210+func (rs *rangeSplitter) addCharsToAlphabet(characters []rune) {211+rs.mu.Lock() // Acquire the lock212+defer rs.mu.Unlock() // Release the lock when the function exits213+allAlphabet := rs.sortedAlphabet214+newChars := false215+for _, char := range characters {216+if _, exists := rs.alphabetMap[char]; !exists {217+allAlphabet = append(allAlphabet, char)218+newChars = true219+ }220+ }221+if newChars {222+sortAlphabet(allAlphabet)223+rs.sortedAlphabet = allAlphabet224+rs.alphabetMap = constructAlphabetMap(rs.sortedAlphabet)225+ }226+}227+228+// isRangeEqualWithPadding checks if two range strings are identical. Equality229+// encompasses any padding using the smallest alphabet character from the set.230+func (rs *rangeSplitter) isRangeEqualWithPadding(startRange, endRange []rune) bool {231+232+sortedAlphabet := rs.sortedAlphabet233+234+// When the end range is unspecified, it's interpreted as a sequence of the235+// highest possible characters. Consequently, they are not deemed equal.236+// If start range has highest possible characters, then smaller characters237+// are appended to start range to find split points.238+if len(endRange) == 0 {239+return false240+ }241+242+// Get the longer length of the two range strings.243+maxLength := max(len(startRange), len(endRange))244+245+smallestChar := sortedAlphabet[0]246+247+// Loop through the string range.248+for i := 0; i < maxLength; i++ {249+250+// In cases where a character is absent at a specific position (due to a length251+// difference), the position is padded with the smallest character in the alphabet.252+charStart := charAtOrDefault(startRange, i, smallestChar)253+charEnd := charAtOrDefault(endRange, i, smallestChar)254+255+// As soon as we find a difference, we conclude the two strings are different.256+if charStart != charEnd {257+return false258+ }259+ }260+// Otherwise, we conclude the two strings are equal.261+return true262+}263+264+// charAtOrDefault returns the character at the specified position, or the default character if265+// the position is out of bounds.266+func charAtOrDefault(charArray []rune, position int, defaultChar rune) rune {267+if position < 0 || position >= len(charArray) {268+return defaultChar269+ }270+return (charArray)[position]271+}272+273+// convertStringRangeToMinimalIntRange gradually extends the start and end string274+// range in base-10 representation, until the difference reaches a threshold275+// suitable for splitting.276+func (rs *rangeSplitter) convertStringRangeToMinimalIntRange(277+startRange, endRange []rune, numSplits int) (*minimalIntRange, error) {278+279+startInteger := big.NewInt(0)280+endInteger := big.NewInt(0)281+282+alphabetLength := len(rs.sortedAlphabet)283+startChar := (rs.sortedAlphabet)[0]284+endChar := (rs.sortedAlphabet)[alphabetLength-1]285+286+endDefaultChar := startChar287+if len(endRange) == 0 {288+endDefaultChar = endChar289+ }290+291+for i := 0; ; i++ {292+293+// Convert each character of the start range string into a big integer294+// based on the alphabet system.295+startPosition, err := rs.charPosition(charAtOrDefault(startRange, i, startChar))296+if err != nil {297+return nil, err298+ }299+startInteger.Mul(startInteger, big.NewInt(int64(alphabetLength)))300+startInteger.Add(startInteger, big.NewInt(int64(startPosition)))301+302+// Convert each character of the end range string into a big integer303+// based on the alphabet system.304+endPosition, err := rs.charPosition(charAtOrDefault(endRange, i, endDefaultChar))305+if err != nil {306+return nil, err307+ }308+endInteger.Mul(endInteger, big.NewInt(int64(alphabetLength)))309+endInteger.Add(endInteger, big.NewInt(int64(endPosition)))310+311+// Calculate the difference between the start and end range in big integer representation.312+difference := new(big.Int).Sub(endInteger, startInteger)313+314+// If the difference is bigger than the number of split points, we are done.315+// In particular, the minimal length is one greater than the index (due to zero indexing).316+if difference.Cmp(big.NewInt(int64(numSplits))) > 0 {317+return &minimalIntRange{318+startInteger: startInteger,319+endInteger: endInteger,320+minimalLength: i + 1,321+ }, nil322+ }323+ }324+}325+326+// charPosition returns the index of the character in the alphabet set.327+func (rs *rangeSplitter) charPosition(ch rune) (int, error) {328+if idx, ok := rs.alphabetMap[ch]; ok {329+return idx, nil330+ }331+return -1, fmt.Errorf("character %c is not found in the alphabet map %v", ch, rs.alphabetMap)332+}333+334+// convertRangeStringToArray transforms the range string into a rune slice while335+// verifying the presence of each character in the alphabets.336+func (rs *rangeSplitter) convertRangeStringToArray(rangeString string) ([]rune, error) {337+for _, char := range rangeString {338+if _, exists := rs.alphabetMap[char]; !exists {339+return nil, fmt.Errorf("character %c in range string %q is not found in the alphabet array", char, rangeString)340+ }341+ }342+characterArray := []rune(rangeString)343+return characterArray, nil344+}345+346+// convertIntToString converts the split point from base-10 to base-N.347+func (rs *rangeSplitter) convertIntToString(splitPoint *big.Int, stringLength int) string {348+349+remainder := new(big.Int)350+351+var splitChar []rune352+alphabetSize := big.NewInt(int64(len(rs.sortedAlphabet)))353+354+// Iterate through the split point and convert alphabet by alphabet.355+for i := 0; i < stringLength; i++ {356+remainder.Mod(splitPoint, alphabetSize)357+splitPoint.Div(splitPoint, alphabetSize)358+splitChar = append(splitChar, (rs.sortedAlphabet)[(int)(remainder.Int64())])359+ }360+361+// Reverse the converted alphabet order because we originally processed from right to left.362+for i, j := 0, len(splitChar)-1; i < j; i, j = i+1, j-1 {363+splitChar[i], splitChar[j] = splitChar[j], splitChar[i]364+ }365+366+return string(splitChar)42367}