summaryrefslogtreecommitdiff
path: root/pcre/pcre_scanner.h
blob: 5617e4515cb9620ab1803ab968c1b0feb6a740be (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//
// Regular-expression based scanner for parsing an input stream.
//
// Example 1: parse a sequence of "var = number" entries from input:
//
//      Scanner scanner(input);
//      string var;
//      int number;
//      scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
//      while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
//        ...;
//      }

#ifndef _PCRE_SCANNER_H
#define _PCRE_SCANNER_H

#include <assert.h>
#include <string>
#include <vector>

#include <pcrecpp.h>
#include <pcre_stringpiece.h>

namespace pcrecpp {

class PCRECPP_EXP_DEFN Scanner {
 public:
  Scanner();
  explicit Scanner(const std::string& input);
  ~Scanner();

  // Return current line number.  The returned line-number is
  // one-based.  I.e. it returns 1 + the number of consumed newlines.
  //
  // Note: this method may be slow.  It may take time proportional to
  // the size of the input.
  int LineNumber() const;

  // Return the byte-offset that the scanner is looking in the
  // input data;
  int Offset() const;

  // Return true iff the start of the remaining input matches "re"
  bool LookingAt(const RE& re) const;

  // Return true iff all of the following are true
  //    a. the start of the remaining input matches "re",
  //    b. if any arguments are supplied, matched sub-patterns can be
  //       parsed and stored into the arguments.
  // If it returns true, it skips over the matched input and any
  // following input that matches the "skip" regular expression.
  bool Consume(const RE& re,
               const Arg& arg0 = RE::no_arg,
               const Arg& arg1 = RE::no_arg,
               const Arg& arg2 = RE::no_arg
               // TODO: Allow more arguments?
               );

  // Set the "skip" regular expression.  If after consuming some data,
  // a prefix of the input matches this RE, it is automatically
  // skipped.  For example, a programming language scanner would use
  // a skip RE that matches white space and comments.
  //
  //    scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
  //
  // Skipping repeats as long as it succeeds.  We used to let people do
  // this by writing "(...)*" in the regular expression, but that added
  // up to lots of recursive calls within the pcre library, so now we
  // control repetition explicitly via the function call API.
  //
  // You can pass NULL for "re" if you do not want any data to be skipped.
  void Skip(const char* re);   // DEPRECATED; does *not* repeat
  void SetSkipExpression(const char* re);

  // Temporarily pause "skip"ing. This
  //   Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
  // is similar to
  //   Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo");
  // but avoids creating/deleting new RE objects.
  void DisableSkip();

  // Reenable previously paused skipping.  Any prefix of the input
  // that matches the skip pattern is immediately dropped.
  void EnableSkip();

  /***** Special wrappers around SetSkip() for some common idioms *****/

  // Arranges to skip whitespace, C comments, C++ comments.
  // The overall RE is a disjunction of the following REs:
  //    \\s                     whitespace
  //    //.*\n                  C++ comment
  //    /[*](.|\n)*?[*]/        C comment (x*? means minimal repetitions of x)
  // We get repetition via the semantics of SetSkipExpression, not by using *
  void SkipCXXComments() {
    SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
  }

  void set_save_comments(bool comments) {
    save_comments_ = comments;
  }

  bool save_comments() {
    return save_comments_;
  }

  // Append to vector ranges the comments found in the
  // byte range [start,end] (inclusive) of the input data.
  // Only comments that were extracted entirely within that
  // range are returned: no range splitting of atomically-extracted
  // comments is performed.
  void GetComments(int start, int end, std::vector<StringPiece> *ranges);

  // Append to vector ranges the comments added
  // since the last time this was called. This
  // functionality is provided for efficiency when
  // interleaving scanning with parsing.
  void GetNextComments(std::vector<StringPiece> *ranges);

 private:
  std::string   data_;          // All the input data
  StringPiece   input_;         // Unprocessed input
  RE*           skip_;          // If non-NULL, RE for skipping input
  bool          should_skip_;   // If true, use skip_
  bool          skip_repeat_;   // If true, repeat skip_ as long as it works
  bool          save_comments_; // If true, aggregate the skip expression

  // the skipped comments
  // TODO: later consider requiring that the StringPieces be added
  // in order by their start position
  std::vector<StringPiece> *comments_;

  // the offset into comments_ that has been returned by GetNextComments
  int           comments_offset_;

  // helper function to consume *skip_ and honour
  // save_comments_
  void ConsumeSkip();
};

}   // namespace pcrecpp

#endif /* _PCRE_SCANNER_H */