feat(storage/dataflux): add range_splitter #10748 (#10899) · googleapis/google-cloud-go@d49da26
@@ -15,13 +15,16 @@
1515package dataflux
16161717import (
18+"fmt"
19+"math/big"
20+"sort"
1821"sync"
1922)
20232124// rangeSplitter specifies the a list and a map of sorted alphabets.
2225type rangeSplitter struct {
2326mu sync.Mutex
24-sortedAlphabet *[]rune
27+sortedAlphabet []rune
2528alphabetMap map[rune]int
2629}
2730@@ -31,12 +34,334 @@ type listRange struct {
3134endRange string
3235}
333637+// minimalIntRange specifies start and end range in base-10 form, along with the
38+// minimal string length for the split range strings.
39+type minimalIntRange struct {
40+startInteger *big.Int
41+endInteger *big.Int
42+minimalLength int
43+}
44+45+// generateSplitsOpts specifies the parameters needed to generate the split
46+// range strings.
47+type generateSplitsOpts struct {
48+minimalIntRange *minimalIntRange
49+numSplits int
50+startRange string
51+endRange string
52+}
53+3454// newRangeSplitter creates a new RangeSplitter with the given alphabets.
35-func newRangeSplitter(alphabet string) *rangeSplitter {
36-return &rangeSplitter{}
55+// RangeSplitter determines split points within a given range based on the given
56+// alphabets.
57+func newRangeSplitter(alphabet string) (*rangeSplitter, error) {
58+59+// Validate that we do not have empty alphabet passed in.
60+if len(alphabet) == 0 {
61+return nil, fmt.Errorf("no alphabet specified for the range splitter")
62+ }
63+// Sort the alphabet lexicographically and store a mapping of each alphabet
64+// to its index. We need a mapping for efficient index lookup in later operations.
65+sortedAlphabet := []rune(alphabet)
66+sortAlphabet(sortedAlphabet)
67+alphabetMap := constructAlphabetMap(sortedAlphabet)
68+69+return &rangeSplitter{
70+alphabetMap: alphabetMap,
71+sortedAlphabet: sortedAlphabet,
72+ }, nil
3773}
387439-// splitRange creates a given number of splits based on a provided start and end range.
75+// splitRange divides the provided start and end range into approximately equal
76+// subranges, returning the split points. An empty slice is returned if suitable
77+// splits cannot be determined. Please note that this method provides a rough
78+// estimate of split points, without ensuring precise even partitioning of the range.
79+// Additionally, the number of found splits might be fewer than requested if the
80+// algorithm struggles to find sufficient split points. If the start range is empty
81+// the algorithm assumes it to be sequence of smallest possible character and empty
82+// end range as sequence of highest possible characters.
83+//
84+// For example, sorted alphabet = {"a","b","c","d"}
85+// Input: startRange= "d", endRange= "", numSplits=2
86+//
87+// This will be converted from base-N to base-10 integers.
88+// While calculating base-10 integer, "a" will be appended to startRange
89+// and "d" will be appended to endRange until the difference between integers is
90+// more than number of splits.
91+// startInteger for "da" = 12, endInteger for "dd" = 15
92+//
93+// The splits points will be 13 and 14 in base-10. This will be converted back to
94+// base-N value and returned as split points:
95+// {"db","dc"}
96+4097func (rs *rangeSplitter) splitRange(startRange, endRange string, numSplits int) ([]string, error) {
41-return nil, nil
98+// Number of splits has to be at least one, otherwise it is not splittable.
99+if numSplits < 1 {
100+return nil, fmt.Errorf("number of splits should be at least 1, got %d", numSplits)
101+ }
102+103+// End range (if specified) has to be lexicographically greater than the start range
104+// for the range to be valid.
105+if len(endRange) != 0 && startRange >= endRange {
106+return nil, fmt.Errorf("start range %q cannot be lexicographically greater than end range %q", startRange, endRange)
107+ }
108+109+rs.addCharsToAlphabet([]rune(startRange))
110+rs.addCharsToAlphabet([]rune(endRange))
111+112+// Validate start range characters and convert into character array form.
113+startRangeCharArray, err := rs.convertRangeStringToArray(startRange)
114+if err != nil {
115+return nil, fmt.Errorf("unable to convert start range %q to array: %v", startRange, err)
116+ }
117+118+// Validate end range characters and convert into character array form.
119+endRangeCharArray, err := rs.convertRangeStringToArray(endRange)
120+if err != nil {
121+return nil, fmt.Errorf("unable to convert end range %q to array: %v", endRange, err)
122+ }
123+124+// Construct the final split ranges to be returned.
125+var splitPoints []string
126+127+// If the start and end string ranges are equal with padding, no splitting is
128+// necessary. In such cases, an empty array of split ranges is returned.
129+if rs.isRangeEqualWithPadding(startRangeCharArray, endRangeCharArray) {
130+return splitPoints, nil
131+ }
132+// Convert the range strings from base-N to base-10 and employ a greedy approach
133+// to determine the smallest splittable integer range difference.
134+minimalIntRange, err := rs.convertStringRangeToMinimalIntRange(
135+startRangeCharArray, endRangeCharArray, numSplits)
136+if err != nil {
137+return nil, fmt.Errorf("range splitting with start range %q and end range %q: %v",
138+startRange, endRange, err)
139+ }
140+141+// Generate the split points and return them.
142+splitPoints = rs.generateSplits(generateSplitsOpts{
143+startRange: startRange,
144+endRange: endRange,
145+numSplits: numSplits,
146+minimalIntRange: minimalIntRange,
147+ })
148+149+return splitPoints, nil
150+}
151+152+// generateSplits generates the split points by translating the start and end
153+// range strings into base-10 integers, performing a split within the integer
154+// domain, and then converting the splits back into strings. In essence, this
155+// operation resembles a base-N to base-10 conversion, followed by a split in
156+// base 10, and finally another base-10 to base-N conversion. In this scenario,
157+// N represents the size of the alphabet, with the character's position in the
158+// alphabet indicating the digit's value.
159+func (rs *rangeSplitter) generateSplits(opts generateSplitsOpts) []string {
160+161+startInteger := opts.minimalIntRange.startInteger
162+endInteger := opts.minimalIntRange.endInteger
163+minimalLength := opts.minimalIntRange.minimalLength
164+165+rangeDifference := new(big.Int).Sub(endInteger, startInteger)
166+167+var splitPoints []string
168+169+// The number of intervals is one more than the number of split points.
170+rangeInterval := new(big.Int).SetInt64(int64(opts.numSplits + 1))
171+172+for i := 1; i <= opts.numSplits; i++ {
173+// Combine the range interval and index to determine the split point in base-10 form.
174+rangeDiffWithIdx := new(big.Int).Mul(rangeDifference, big.NewInt(int64(i)))
175+rangeInterval := new(big.Int).Div(rangeDiffWithIdx, rangeInterval)
176+splitPoint := new(big.Int).Add(rangeInterval, startInteger)
177+178+// Convert the split point back from base-10 to base-N.
179+splitString := rs.convertIntToString(splitPoint, minimalLength)
180+181+// Due to the approximate nature on how the minimal int range is derived, we need to perform
182+// another validation to check to ensure each split point falls in valid range.
183+isGreaterThanStart := len(splitString) > 0 && splitString > opts.startRange
184+isLessThanEnd := len(opts.endRange) == 0 || (len(splitString) > 0 && splitString < opts.endRange)
185+if isGreaterThanStart && isLessThanEnd {
186+splitPoints = append(splitPoints, splitString)
187+ }
188+ }
189+return splitPoints
190+}
191+192+// sortAlphabet sorts the alphabets string lexicographically.
193+func sortAlphabet(unsortedAlphabet []rune) {
194+sort.Slice(unsortedAlphabet, func(i, j int) bool {
195+return unsortedAlphabet[i] < unsortedAlphabet[j]
196+ })
197+}
198+199+// constructAlphabetMap constructs a mapping from each character in the
200+// alphabets to its index in the alphabet array.
201+func constructAlphabetMap(alphabet []rune) map[rune]int {
202+alphabetMap := make(map[rune]int)
203+for i, char := range alphabet {
204+alphabetMap[char] = i
205+ }
206+return alphabetMap
207+}
208+209+// addCharsToAlphabet adds a character to the known alphabet.
210+func (rs *rangeSplitter) addCharsToAlphabet(characters []rune) {
211+rs.mu.Lock() // Acquire the lock
212+defer rs.mu.Unlock() // Release the lock when the function exits
213+allAlphabet := rs.sortedAlphabet
214+newChars := false
215+for _, char := range characters {
216+if _, exists := rs.alphabetMap[char]; !exists {
217+allAlphabet = append(allAlphabet, char)
218+newChars = true
219+ }
220+ }
221+if newChars {
222+sortAlphabet(allAlphabet)
223+rs.sortedAlphabet = allAlphabet
224+rs.alphabetMap = constructAlphabetMap(rs.sortedAlphabet)
225+ }
226+}
227+228+// isRangeEqualWithPadding checks if two range strings are identical. Equality
229+// encompasses any padding using the smallest alphabet character from the set.
230+func (rs *rangeSplitter) isRangeEqualWithPadding(startRange, endRange []rune) bool {
231+232+sortedAlphabet := rs.sortedAlphabet
233+234+// When the end range is unspecified, it's interpreted as a sequence of the
235+// highest possible characters. Consequently, they are not deemed equal.
236+// If start range has highest possible characters, then smaller characters
237+// are appended to start range to find split points.
238+if len(endRange) == 0 {
239+return false
240+ }
241+242+// Get the longer length of the two range strings.
243+maxLength := max(len(startRange), len(endRange))
244+245+smallestChar := sortedAlphabet[0]
246+247+// Loop through the string range.
248+for i := 0; i < maxLength; i++ {
249+250+// In cases where a character is absent at a specific position (due to a length
251+// difference), the position is padded with the smallest character in the alphabet.
252+charStart := charAtOrDefault(startRange, i, smallestChar)
253+charEnd := charAtOrDefault(endRange, i, smallestChar)
254+255+// As soon as we find a difference, we conclude the two strings are different.
256+if charStart != charEnd {
257+return false
258+ }
259+ }
260+// Otherwise, we conclude the two strings are equal.
261+return true
262+}
263+264+// charAtOrDefault returns the character at the specified position, or the default character if
265+// the position is out of bounds.
266+func charAtOrDefault(charArray []rune, position int, defaultChar rune) rune {
267+if position < 0 || position >= len(charArray) {
268+return defaultChar
269+ }
270+return (charArray)[position]
271+}
272+273+// convertStringRangeToMinimalIntRange gradually extends the start and end string
274+// range in base-10 representation, until the difference reaches a threshold
275+// suitable for splitting.
276+func (rs *rangeSplitter) convertStringRangeToMinimalIntRange(
277+startRange, endRange []rune, numSplits int) (*minimalIntRange, error) {
278+279+startInteger := big.NewInt(0)
280+endInteger := big.NewInt(0)
281+282+alphabetLength := len(rs.sortedAlphabet)
283+startChar := (rs.sortedAlphabet)[0]
284+endChar := (rs.sortedAlphabet)[alphabetLength-1]
285+286+endDefaultChar := startChar
287+if len(endRange) == 0 {
288+endDefaultChar = endChar
289+ }
290+291+for i := 0; ; i++ {
292+293+// Convert each character of the start range string into a big integer
294+// based on the alphabet system.
295+startPosition, err := rs.charPosition(charAtOrDefault(startRange, i, startChar))
296+if err != nil {
297+return nil, err
298+ }
299+startInteger.Mul(startInteger, big.NewInt(int64(alphabetLength)))
300+startInteger.Add(startInteger, big.NewInt(int64(startPosition)))
301+302+// Convert each character of the end range string into a big integer
303+// based on the alphabet system.
304+endPosition, err := rs.charPosition(charAtOrDefault(endRange, i, endDefaultChar))
305+if err != nil {
306+return nil, err
307+ }
308+endInteger.Mul(endInteger, big.NewInt(int64(alphabetLength)))
309+endInteger.Add(endInteger, big.NewInt(int64(endPosition)))
310+311+// Calculate the difference between the start and end range in big integer representation.
312+difference := new(big.Int).Sub(endInteger, startInteger)
313+314+// If the difference is bigger than the number of split points, we are done.
315+// In particular, the minimal length is one greater than the index (due to zero indexing).
316+if difference.Cmp(big.NewInt(int64(numSplits))) > 0 {
317+return &minimalIntRange{
318+startInteger: startInteger,
319+endInteger: endInteger,
320+minimalLength: i + 1,
321+ }, nil
322+ }
323+ }
324+}
325+326+// charPosition returns the index of the character in the alphabet set.
327+func (rs *rangeSplitter) charPosition(ch rune) (int, error) {
328+if idx, ok := rs.alphabetMap[ch]; ok {
329+return idx, nil
330+ }
331+return -1, fmt.Errorf("character %c is not found in the alphabet map %v", ch, rs.alphabetMap)
332+}
333+334+// convertRangeStringToArray transforms the range string into a rune slice while
335+// verifying the presence of each character in the alphabets.
336+func (rs *rangeSplitter) convertRangeStringToArray(rangeString string) ([]rune, error) {
337+for _, char := range rangeString {
338+if _, exists := rs.alphabetMap[char]; !exists {
339+return nil, fmt.Errorf("character %c in range string %q is not found in the alphabet array", char, rangeString)
340+ }
341+ }
342+characterArray := []rune(rangeString)
343+return characterArray, nil
344+}
345+346+// convertIntToString converts the split point from base-10 to base-N.
347+func (rs *rangeSplitter) convertIntToString(splitPoint *big.Int, stringLength int) string {
348+349+remainder := new(big.Int)
350+351+var splitChar []rune
352+alphabetSize := big.NewInt(int64(len(rs.sortedAlphabet)))
353+354+// Iterate through the split point and convert alphabet by alphabet.
355+for i := 0; i < stringLength; i++ {
356+remainder.Mod(splitPoint, alphabetSize)
357+splitPoint.Div(splitPoint, alphabetSize)
358+splitChar = append(splitChar, (rs.sortedAlphabet)[(int)(remainder.Int64())])
359+ }
360+361+// Reverse the converted alphabet order because we originally processed from right to left.
362+for i, j := 0, len(splitChar)-1; i < j; i, j = i+1, j-1 {
363+splitChar[i], splitChar[j] = splitChar[j], splitChar[i]
364+ }
365+366+return string(splitChar)
42367}