1#pragma once
2
#include <fcntl.h>
#include <unistd.h>

#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>
5
6class SegmentInput
7{
8 public:
9  explicit SegmentInput(const char* filename, int bufsize=kBufferSize)
10    : filename_(filename),
11      fd_(::open(filename, O_RDONLY)),
12      buffer_size_(bufsize),
13      data_(new char[buffer_size_])
14  {
15    refill();
16  }
17
18  const std::string& filename() const { return filename_; }
19  int64_t tell() const { return offset_; }
20  const std::string& current_word() const { return word_; }
21  int64_t current_count() const { return count_; }
22
23  bool next()
24  {
25    if (avail_ <= 0)
26      return false;
27    char* nl = static_cast<char*>(::memchr(start_, '\n', avail_));
28    if (nl)
29    {
30      char* tab = static_cast<char*>(::memchr(start_, '\t', nl - start_));
31      if (tab)
32      {
33        count_ = strtol(tab+1, NULL, 10);
34        word_ = std::string_view(start_, tab-start_);
35
36        int len = nl - start_ + 1;
37        avail_ -= len;
38        offset_ += len;
39        assert(avail_ >= 0);
40        if (avail_ == 0)
41        {
42          refill();
43        }
44        else
45        {
46          start_ += len;
47        }
48        return true;
49      }
50      else
51      {
52        avail_ = 0;
53        assert(0);
54        return false;
55      }
56    }
57    else
58    {
59      refill();
60      return next();
61    }
62  }
63
64 private:
65  void refill()
66  {
67    start_ = data_.get();
68    avail_ = ::pread(fd_, start_, buffer_size_, offset_);
69  }
70
71  const std::string filename_;
72  const int fd_;
73  const int buffer_size_;
74  int64_t offset_ = 0;  // file position
75
76  char* start_ = nullptr;
77  int avail_ = 0;
78  std::unique_ptr<char[]> data_;
79
80  std::string word_;
81  int64_t count_ = 0;
82
83  SegmentInput(const SegmentInput&) = delete;
84  void operator=(const SegmentInput&) = delete;
85};
86
87
88