feat(storage/dataflux): add range_splitter #10748 (#10899) · googleapis/google-cloud-go@d49da26

@@ -15,13 +15,16 @@

1515

package dataflux

16161717

import (

18+

"fmt"

19+

"math/big"

20+

"sort"

1821

"sync"

1922

)

20232124

// rangeSplitter specifies the a list and a map of sorted alphabets.

2225

type rangeSplitter struct {

2326

mu sync.Mutex

24-

sortedAlphabet *[]rune

27+

sortedAlphabet []rune

2528

alphabetMap map[rune]int

2629

}

2730

@@ -31,12 +34,334 @@ type listRange struct {

3134

endRange string

3235

}

333637+

// minimalIntRange specifies start and end range in base-10 form, along with the

38+

// minimal string length for the split range strings.

39+

type minimalIntRange struct {

40+

startInteger *big.Int

41+

endInteger *big.Int

42+

minimalLength int

43+

}

44+45+

// generateSplitsOpts specifies the parameters needed to generate the split

46+

// range strings.

47+

type generateSplitsOpts struct {

48+

minimalIntRange *minimalIntRange

49+

numSplits int

50+

startRange string

51+

endRange string

52+

}

53+3454

// newRangeSplitter creates a new RangeSplitter with the given alphabets.

35-

func newRangeSplitter(alphabet string) *rangeSplitter {

36-

return &rangeSplitter{}

55+

// RangeSplitter determines split points within a given range based on the given

56+

// alphabets.

57+

func newRangeSplitter(alphabet string) (*rangeSplitter, error) {

58+59+

// Validate that we do not have empty alphabet passed in.

60+

if len(alphabet) == 0 {

61+

return nil, fmt.Errorf("no alphabet specified for the range splitter")

62+

}

63+

// Sort the alphabet lexicographically and store a mapping of each alphabet

64+

// to its index. We need a mapping for efficient index lookup in later operations.

65+

sortedAlphabet := []rune(alphabet)

66+

sortAlphabet(sortedAlphabet)

67+

alphabetMap := constructAlphabetMap(sortedAlphabet)

68+69+

return &rangeSplitter{

70+

alphabetMap: alphabetMap,

71+

sortedAlphabet: sortedAlphabet,

72+

}, nil

3773

}

387439-

// splitRange creates a given number of splits based on a provided start and end range.

75+

// splitRange divides the provided start and end range into approximately equal

76+

// subranges, returning the split points. An empty slice is returned if suitable

77+

// splits cannot be determined. Please note that this method provides a rough

78+

// estimate of split points, without ensuring precise even partitioning of the range.

79+

// Additionally, the number of found splits might be fewer than requested if the

80+

// algorithm struggles to find sufficient split points. If the start range is empty

81+

// the algorithm assumes it to be sequence of smallest possible character and empty

82+

// end range as sequence of highest possible characters.

83+

//

84+

// For example, sorted alphabet = {"a","b","c","d"}

85+

// Input: startRange= "d", endRange= "", numSplits=2

86+

//

87+

// This will be converted from base-N to base-10 integers.

88+

// While calculating base-10 integer, "a" will be appended to startRange

89+

// and "d" will be appended to endRange until the difference between integers is

90+

// more than number of splits.

91+

// startInteger for "da" = 12, endInteger for "dd" = 15

92+

//

93+

// The splits points will be 13 and 14 in base-10. This will be converted back to

94+

// base-N value and returned as split points:

95+

// {"db","dc"}

96+4097

func (rs *rangeSplitter) splitRange(startRange, endRange string, numSplits int) ([]string, error) {

41-

return nil, nil

98+

// Number of splits has to be at least one, otherwise it is not splittable.

99+

if numSplits < 1 {

100+

return nil, fmt.Errorf("number of splits should be at least 1, got %d", numSplits)

101+

}

102+103+

// End range (if specified) has to be lexicographically greater than the start range

104+

// for the range to be valid.

105+

if len(endRange) != 0 && startRange >= endRange {

106+

return nil, fmt.Errorf("start range %q cannot be lexicographically greater than end range %q", startRange, endRange)

107+

}

108+109+

rs.addCharsToAlphabet([]rune(startRange))

110+

rs.addCharsToAlphabet([]rune(endRange))

111+112+

// Validate start range characters and convert into character array form.

113+

startRangeCharArray, err := rs.convertRangeStringToArray(startRange)

114+

if err != nil {

115+

return nil, fmt.Errorf("unable to convert start range %q to array: %v", startRange, err)

116+

}

117+118+

// Validate end range characters and convert into character array form.

119+

endRangeCharArray, err := rs.convertRangeStringToArray(endRange)

120+

if err != nil {

121+

return nil, fmt.Errorf("unable to convert end range %q to array: %v", endRange, err)

122+

}

123+124+

// Construct the final split ranges to be returned.

125+

var splitPoints []string

126+127+

// If the start and end string ranges are equal with padding, no splitting is

128+

// necessary. In such cases, an empty array of split ranges is returned.

129+

if rs.isRangeEqualWithPadding(startRangeCharArray, endRangeCharArray) {

130+

return splitPoints, nil

131+

}

132+

// Convert the range strings from base-N to base-10 and employ a greedy approach

133+

// to determine the smallest splittable integer range difference.

134+

minimalIntRange, err := rs.convertStringRangeToMinimalIntRange(

135+

startRangeCharArray, endRangeCharArray, numSplits)

136+

if err != nil {

137+

return nil, fmt.Errorf("range splitting with start range %q and end range %q: %v",

138+

startRange, endRange, err)

139+

}

140+141+

// Generate the split points and return them.

142+

splitPoints = rs.generateSplits(generateSplitsOpts{

143+

startRange: startRange,

144+

endRange: endRange,

145+

numSplits: numSplits,

146+

minimalIntRange: minimalIntRange,

147+

})

148+149+

return splitPoints, nil

150+

}

151+152+

// generateSplits generates the split points by translating the start and end

153+

// range strings into base-10 integers, performing a split within the integer

154+

// domain, and then converting the splits back into strings. In essence, this

155+

// operation resembles a base-N to base-10 conversion, followed by a split in

156+

// base 10, and finally another base-10 to base-N conversion. In this scenario,

157+

// N represents the size of the alphabet, with the character's position in the

158+

// alphabet indicating the digit's value.

159+

func (rs *rangeSplitter) generateSplits(opts generateSplitsOpts) []string {

160+161+

startInteger := opts.minimalIntRange.startInteger

162+

endInteger := opts.minimalIntRange.endInteger

163+

minimalLength := opts.minimalIntRange.minimalLength

164+165+

rangeDifference := new(big.Int).Sub(endInteger, startInteger)

166+167+

var splitPoints []string

168+169+

// The number of intervals is one more than the number of split points.

170+

rangeInterval := new(big.Int).SetInt64(int64(opts.numSplits + 1))

171+172+

for i := 1; i <= opts.numSplits; i++ {

173+

// Combine the range interval and index to determine the split point in base-10 form.

174+

rangeDiffWithIdx := new(big.Int).Mul(rangeDifference, big.NewInt(int64(i)))

175+

rangeInterval := new(big.Int).Div(rangeDiffWithIdx, rangeInterval)

176+

splitPoint := new(big.Int).Add(rangeInterval, startInteger)

177+178+

// Convert the split point back from base-10 to base-N.

179+

splitString := rs.convertIntToString(splitPoint, minimalLength)

180+181+

// Due to the approximate nature on how the minimal int range is derived, we need to perform

182+

// another validation to check to ensure each split point falls in valid range.

183+

isGreaterThanStart := len(splitString) > 0 && splitString > opts.startRange

184+

isLessThanEnd := len(opts.endRange) == 0 || (len(splitString) > 0 && splitString < opts.endRange)

185+

if isGreaterThanStart && isLessThanEnd {

186+

splitPoints = append(splitPoints, splitString)

187+

}

188+

}

189+

return splitPoints

190+

}

191+192+

// sortAlphabet sorts the alphabets string lexicographically.

193+

func sortAlphabet(unsortedAlphabet []rune) {

194+

sort.Slice(unsortedAlphabet, func(i, j int) bool {

195+

return unsortedAlphabet[i] < unsortedAlphabet[j]

196+

})

197+

}

198+199+

// constructAlphabetMap constructs a mapping from each character in the

200+

// alphabets to its index in the alphabet array.

201+

func constructAlphabetMap(alphabet []rune) map[rune]int {

202+

alphabetMap := make(map[rune]int)

203+

for i, char := range alphabet {

204+

alphabetMap[char] = i

205+

}

206+

return alphabetMap

207+

}

208+209+

// addCharsToAlphabet adds a character to the known alphabet.

210+

func (rs *rangeSplitter) addCharsToAlphabet(characters []rune) {

211+

rs.mu.Lock() // Acquire the lock

212+

defer rs.mu.Unlock() // Release the lock when the function exits

213+

allAlphabet := rs.sortedAlphabet

214+

newChars := false

215+

for _, char := range characters {

216+

if _, exists := rs.alphabetMap[char]; !exists {

217+

allAlphabet = append(allAlphabet, char)

218+

newChars = true

219+

}

220+

}

221+

if newChars {

222+

sortAlphabet(allAlphabet)

223+

rs.sortedAlphabet = allAlphabet

224+

rs.alphabetMap = constructAlphabetMap(rs.sortedAlphabet)

225+

}

226+

}

227+228+

// isRangeEqualWithPadding checks if two range strings are identical. Equality

229+

// encompasses any padding using the smallest alphabet character from the set.

230+

func (rs *rangeSplitter) isRangeEqualWithPadding(startRange, endRange []rune) bool {

231+232+

sortedAlphabet := rs.sortedAlphabet

233+234+

// When the end range is unspecified, it's interpreted as a sequence of the

235+

// highest possible characters. Consequently, they are not deemed equal.

236+

// If start range has highest possible characters, then smaller characters

237+

// are appended to start range to find split points.

238+

if len(endRange) == 0 {

239+

return false

240+

}

241+242+

// Get the longer length of the two range strings.

243+

maxLength := max(len(startRange), len(endRange))

244+245+

smallestChar := sortedAlphabet[0]

246+247+

// Loop through the string range.

248+

for i := 0; i < maxLength; i++ {

249+250+

// In cases where a character is absent at a specific position (due to a length

251+

// difference), the position is padded with the smallest character in the alphabet.

252+

charStart := charAtOrDefault(startRange, i, smallestChar)

253+

charEnd := charAtOrDefault(endRange, i, smallestChar)

254+255+

// As soon as we find a difference, we conclude the two strings are different.

256+

if charStart != charEnd {

257+

return false

258+

}

259+

}

260+

// Otherwise, we conclude the two strings are equal.

261+

return true

262+

}

263+264+

// charAtOrDefault returns the character at the specified position, or the default character if

265+

// the position is out of bounds.

266+

func charAtOrDefault(charArray []rune, position int, defaultChar rune) rune {

267+

if position < 0 || position >= len(charArray) {

268+

return defaultChar

269+

}

270+

return (charArray)[position]

271+

}

272+273+

// convertStringRangeToMinimalIntRange gradually extends the start and end string

274+

// range in base-10 representation, until the difference reaches a threshold

275+

// suitable for splitting.

276+

func (rs *rangeSplitter) convertStringRangeToMinimalIntRange(

277+

startRange, endRange []rune, numSplits int) (*minimalIntRange, error) {

278+279+

startInteger := big.NewInt(0)

280+

endInteger := big.NewInt(0)

281+282+

alphabetLength := len(rs.sortedAlphabet)

283+

startChar := (rs.sortedAlphabet)[0]

284+

endChar := (rs.sortedAlphabet)[alphabetLength-1]

285+286+

endDefaultChar := startChar

287+

if len(endRange) == 0 {

288+

endDefaultChar = endChar

289+

}

290+291+

for i := 0; ; i++ {

292+293+

// Convert each character of the start range string into a big integer

294+

// based on the alphabet system.

295+

startPosition, err := rs.charPosition(charAtOrDefault(startRange, i, startChar))

296+

if err != nil {

297+

return nil, err

298+

}

299+

startInteger.Mul(startInteger, big.NewInt(int64(alphabetLength)))

300+

startInteger.Add(startInteger, big.NewInt(int64(startPosition)))

301+302+

// Convert each character of the end range string into a big integer

303+

// based on the alphabet system.

304+

endPosition, err := rs.charPosition(charAtOrDefault(endRange, i, endDefaultChar))

305+

if err != nil {

306+

return nil, err

307+

}

308+

endInteger.Mul(endInteger, big.NewInt(int64(alphabetLength)))

309+

endInteger.Add(endInteger, big.NewInt(int64(endPosition)))

310+311+

// Calculate the difference between the start and end range in big integer representation.

312+

difference := new(big.Int).Sub(endInteger, startInteger)

313+314+

// If the difference is bigger than the number of split points, we are done.

315+

// In particular, the minimal length is one greater than the index (due to zero indexing).

316+

if difference.Cmp(big.NewInt(int64(numSplits))) > 0 {

317+

return &minimalIntRange{

318+

startInteger: startInteger,

319+

endInteger: endInteger,

320+

minimalLength: i + 1,

321+

}, nil

322+

}

323+

}

324+

}

325+326+

// charPosition returns the index of the character in the alphabet set.

327+

func (rs *rangeSplitter) charPosition(ch rune) (int, error) {

328+

if idx, ok := rs.alphabetMap[ch]; ok {

329+

return idx, nil

330+

}

331+

return -1, fmt.Errorf("character %c is not found in the alphabet map %v", ch, rs.alphabetMap)

332+

}

333+334+

// convertRangeStringToArray transforms the range string into a rune slice while

335+

// verifying the presence of each character in the alphabets.

336+

func (rs *rangeSplitter) convertRangeStringToArray(rangeString string) ([]rune, error) {

337+

for _, char := range rangeString {

338+

if _, exists := rs.alphabetMap[char]; !exists {

339+

return nil, fmt.Errorf("character %c in range string %q is not found in the alphabet array", char, rangeString)

340+

}

341+

}

342+

characterArray := []rune(rangeString)

343+

return characterArray, nil

344+

}

345+346+

// convertIntToString converts the split point from base-10 to base-N.

347+

func (rs *rangeSplitter) convertIntToString(splitPoint *big.Int, stringLength int) string {

348+349+

remainder := new(big.Int)

350+351+

var splitChar []rune

352+

alphabetSize := big.NewInt(int64(len(rs.sortedAlphabet)))

353+354+

// Iterate through the split point and convert alphabet by alphabet.

355+

for i := 0; i < stringLength; i++ {

356+

remainder.Mod(splitPoint, alphabetSize)

357+

splitPoint.Div(splitPoint, alphabetSize)

358+

splitChar = append(splitChar, (rs.sortedAlphabet)[(int)(remainder.Int64())])

359+

}

360+361+

// Reverse the converted alphabet order because we originally processed from right to left.

362+

for i, j := 0, len(splitChar)-1; i < j; i, j = i+1, j-1 {

363+

splitChar[i], splitChar[j] = splitChar[j], splitChar[i]

364+

}

365+366+

return string(splitChar)

42367

}