Replace p-limit with own implementation

p-limit isn't very performant: among other things, it allocates heavily and triggers a lot of garbage collection.

Given how fundamental it is to tinybench running "well", we should probably just have our own implementation we can tailor to tinybench's usage.

Something like this would be a start:

/**
 * Builds a small task pool that calls `fn` once per queued argument list,
 * keeping at most `limit` calls in flight at a time.
 *
 * Argument lists are collected with `queue(...)` and nothing is executed
 * until `run()` is called. Results are stored at the position the
 * corresponding arguments were queued in, so the returned array follows
 * queue order rather than completion order.
 *
 * @param fn - Async function invoked with each queued argument list.
 * @param limit - Maximum number of workers started by `run()`; the actual
 * number is `Math.min(limit, <queued items>)`, so a limit below 1 starts no
 * workers and leaves the queue untouched.
 * @returns An object exposing:
 * - `activeCount`: number of `fn` calls currently in flight.
 * - `pendingCount`: number of queued argument lists not yet started.
 * - `queue`: appends an argument list to the queue.
 * - `run`: drains the queue and resolves with the shared results array.
 * It rejects as soon as any `fn` call rejects (`Promise.all` semantics);
 * the remaining workers are not cancelled.
 */
export const withConcurrency = <TParams extends unknown[], TReturn>(
  fn: (...args: TParams) => Promise<TReturn>,
  limit: number
): {
    readonly activeCount: number;
    readonly pendingCount: number;
    queue: (...args: TParams) => void;
    run: () => Promise<TReturn[]>;
  } => {
  const queue: TParams[] = []
  const results: TReturn[] = []
  let activeCount = 0
  let pendingCount = 0
  // Next queue slot to hand out; shared by every worker so each item is
  // claimed exactly once.
  let index = 0
  const worker = async (): Promise<void> => {
    while (index < queue.length) {
      // Claim the slot synchronously, before the first await, so no other
      // worker can pick the same item.
      const currentIndex = index++
      // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
      const args = queue[currentIndex]!
      pendingCount--
      activeCount++
      try {
        results[currentIndex] = await fn(...args)
      } finally {
        // Release the slot whether fn resolved or rejected; without this the
        // counter only ever grows and never reflects in-flight work.
        activeCount--
      }
    }
  }
  return {
    get activeCount () {
      return activeCount
    },
    get pendingCount () {
      return pendingCount
    },
    queue: (...args: TParams) => {
      queue.push(args)
      pendingCount++
    },
    run: async () => {
      // Never start more workers than there are items to process.
      const workerCount = Math.min(limit, queue.length)
      await Promise.all(Array.from({ length: workerCount }, worker))
      return results
    }
  }
}