class interface REGULAR_EXPRESSION
-- ePCRE - Perl-Compatible Regular Expressions written in Eiffel
--
-- heavily based on the sources of the PCRE library package Version 3.4
-- which is open source software, written by Philip Hazel, and copyright
-- by the University of Cambridge, England.
-- ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
-- PCRE is a library of functions to support regular expressions whose syntax
-- and semantics are as close as possible to those of the Perl 5 language.
--
-- (PCRE can also be found on www.sourceforge.com)
--
-- Ported from the C-Source to Eiffel by Harald Erdbrügger he@he-me.de
-- (hopefully no more gotos, continues, breaks and returns ;-) )
--
-- Version 1.01 2001-03-29 - option handling improved (most options can now
-- be changed while the matcher is active; see assertions)
-- - error in next_match fixed
-- - some minor fixes / improvements
-- - improved localization (e.g. for German umlauts, see
-- example.e)
-- - fixed error in replace routines
-- - changed the access to subexpressions in the replace area.
-- now the syntax is \refno\ (e.g. \3\ to access the
-- matched part of the third subexpression)
-- Version 1.0 2001-03-26 - first public release
--
-- For documentation see also the file pcre.txt from the original source
--
creation
make
-- Create a new REGULAR_EXPRESSION object.
-- On return no pattern is compiled, nothing has matched and no matcher
-- is active (postcondition all_reset).
ensure
all_reset: not is_compiled and not has_matched and not is_matcher_active
feature(s) from RE_CHARACTER_TABLES
-- Access
-- Each compiler and matcher run uses these character sets to classify a
-- character type. Change these sets yourself if you want to localize your
-- compiler. Keep in mind that such changes are shared by all compilers, so
-- they affect the behaviour of all subsequent compiler runs of every compiler!
Default_char_casing: RE_CASING_MAPS
-- Default casing maps, shared by all compilers.
-- NOTE(review): presumably the value char_casing has until
-- set_char_casing is called; confirm against RE_COMPILER.
Default_word_set: RE_CHARACTER_SET
-- Default word character set, shared by all compilers.
-- NOTE(review): presumably the value word_set has until
-- set_word_set is called; confirm against RE_COMPILER.
feature(s) from RE_COMPILER
-- character casing
char_casing: RE_CASING_MAPS
-- Casing maps attached to this compiler; replaced by set_char_casing.
word_set: RE_CHARACTER_SET
-- Character set attached to this compiler; replaced by set_word_set.
-- NOTE(review): presumably the characters treated as "word" characters
-- by the pattern compiler and matcher; confirm.
set_char_casing (a_casing_maps: RE_CASING_MAPS)
-- Make a_casing_maps the casing maps of this compiler.
-- Only allowed while no pattern is compiled; a_casing_maps must not be Void.
-- Afterwards char_casing is the very object passed in (reference equality).
require
not is_compiled;
a_casing_maps /= Void
ensure
char_casing = a_casing_maps
set_word_set (a_word_set: RE_CHARACTER_SET)
-- Make a_word_set the word set of this compiler.
-- Only allowed while no pattern is compiled; a_word_set must not be Void.
-- Afterwards word_set is the very object passed in (reference equality).
require
not is_compiled;
a_word_set /= Void
ensure
word_set = a_word_set
feature(s) from RE_COMPILER
-- query
is_compiled: BOOLEAN
-- Was the last compilation successful?
compile_error: STRING
-- Holds the state of the last compile execution.
-- NOTE(review): presumably an error description; its value after a
-- successful compilation is not visible in this interface; confirm.
error_offset: INTEGER
-- Holds the position in the pattern where the error was detected.
-- Result <= 0 implies that no valid position was found.
feature(s) from RE_COMPILER
-- public options not changeable after compilation
-- (their setters set_caseless, set_extended and set_greedy all require
-- not is_compiled)
is_caseless: BOOLEAN
-- If this option is set, letters in the pattern match both upper
-- and lower case letters. It is equivalent to Perl's /i
-- option.
is_extended: BOOLEAN
-- If this option is set, whitespace data characters in the
-- pattern are totally ignored except when escaped or inside a
-- character class, and characters between an unescaped # outside
-- a character class and the next newline character,
-- inclusive, are also ignored. This is equivalent to Perl's /x
-- option, and makes it possible to include comments inside
-- complicated patterns. Note, however, that this applies only
-- to data characters. Whitespace characters may never appear
-- within special character sequences in a pattern, for example
-- within the sequence (?( which introduces a conditional
-- subpattern.
is_greedy: BOOLEAN
-- This option inverts the "greediness" of the quantifiers so
-- that they are not greedy by default, but become greedy if
-- followed by "?". It is not compatible with Perl. It can also
-- be set by a (?U) option setting within the pattern.
-- NOTE(review): the text above matches PCRE's PCRE_UNGREEDY option,
-- while this feature is named is_greedy. Whether True selects the
-- normal greedy behaviour or the inverted one cannot be determined
-- from this interface; confirm against the implementation.
feature(s) from RE_COMPILER
-- public options
-- (their setters list no precondition, so they are not restricted to
-- the time before compilation)
is_multiline: BOOLEAN
-- By default, PCRE treats the subject string as consisting of
-- a single "line" of characters (even if it actually contains
-- several newlines). The "start of line" metacharacter (^)
-- matches only at the start of the string, while the "end of
-- line" metacharacter ($) matches only at the end of the
-- string, or before a terminating newline (unless
-- is_dollar_endonly is set). This is the same as Perl.
-- When is_multiline is set, the "start of line" and "end
-- of line" constructs match immediately following or
-- immediately before any newline in the subject string,
-- respectively, as well as at the very start and end. This is
-- equivalent to Perl's /m option. If there are no "\n"
-- characters in a subject string, or no occurrences of ^ or $ in a
-- pattern, setting is_multiline has no effect.
is_dotall: BOOLEAN
-- If this option is set, a dot metacharacter in the pattern
-- matches all characters, including newlines. Without it,
-- newlines are excluded. This option is equivalent to Perl's /s
-- option. A negative class such as [^a] always matches a
-- newline character, independent of the setting of this option.
is_empty_allowed: BOOLEAN
-- An empty string is not considered to be a valid match if
-- this option is cleared. If there are alternatives in the
-- pattern, they are tried. If all the alternatives match the
-- empty string, the entire match fails. For example, if the
-- pattern a?b?
-- is applied to a string not beginning with "a" or "b", it
-- matches the empty string at the start of the subject. With
-- not is_empty_allowed, this match is not valid, so PCRE
-- searches further into the string for occurrences of "a" or
-- "b". Perl has no direct equivalent of is_empty_allowed, but
-- it does make a special case of a pattern match of the empty
-- string within its split() function, and when using the /g
-- modifier. It is possible to emulate Perl's behaviour after
-- matching a null string by first trying the match again at
-- the same offset with is_empty_allowed cleared, and then if
-- that fails by advancing the starting offset and
-- trying an ordinary match again.
is_dollar_endonly: BOOLEAN
-- If this option is set, a dollar metacharacter in the pattern
-- matches only at the end of the subject string. Without this
-- option, a dollar also matches immediately before the final
-- character if it is a newline (but not before any other
-- newlines). The is_dollar_endonly option is ignored if
-- is_multiline is set. There is no equivalent to this option
-- in Perl.
is_bol: BOOLEAN
-- If not set, the first character of the string is not the
-- beginning of a line, so the circumflex metacharacter should
-- not match before it. Clearing this without is_multiline (at
-- compile time) causes circumflex never to match.
is_eol: BOOLEAN
-- If not set, the end of the string is not the end of a line,
-- so the dollar metacharacter should not match it nor (except
-- in multiline mode) a newline immediately before it. Clearing
-- this without is_multiline (at compile time) causes dollar
-- never to match.
is_anchored: BOOLEAN
-- If this option is set, the pattern is forced to be "anchored",
-- that is, it is constrained to match only at the start of the
-- string which is being searched (the "subject string"). This
-- effect can also be achieved by appropriate constructs in the
-- pattern itself (^), which is the only way to do it in Perl.
feature(s) from RE_COMPILER
-- public options setting
-- set_default_options, set_caseless, set_extended and set_greedy require
-- that no pattern is compiled; the remaining setters list no precondition.
set_default_options
-- Only allowed while no pattern is compiled.
-- NOTE(review): presumably puts every option back to its default
-- value; which options and which values is not visible here; confirm.
require
not is_compiled
set_caseless (a_state: BOOLEAN)
-- Set is_caseless to a_state. Only allowed while no pattern is compiled.
require
not is_compiled
ensure
is_caseless = a_state
set_extended (a_state: BOOLEAN)
-- Set is_extended to a_state. Only allowed while no pattern is compiled.
require
not is_compiled
ensure
is_extended = a_state
set_greedy (a_state: BOOLEAN)
-- Set is_greedy to a_state. Only allowed while no pattern is compiled.
require
not is_compiled
ensure
is_greedy = a_state
set_multiline (a_state: BOOLEAN)
-- Set is_multiline to a_state.
ensure
is_multiline = a_state
set_dotall (a_state: BOOLEAN)
-- Set is_dotall to a_state.
ensure
is_dotall = a_state
set_empty_allowed (a_state: BOOLEAN)
-- Set is_empty_allowed to a_state.
ensure
is_empty_allowed = a_state
set_dollar_endonly (a_state: BOOLEAN)
-- Set is_dollar_endonly to a_state.
ensure
is_dollar_endonly = a_state
set_bol (a_state: BOOLEAN)
-- Set is_bol to a_state.
ensure
is_bol = a_state
set_eol (a_state: BOOLEAN)
-- Set is_eol to a_state.
ensure
is_eol = a_state
set_anchored (a_state: BOOLEAN)
-- Set is_anchored to a_state.
ensure
is_anchored = a_state
feature(s) from REGULAR_EXPRESSION
-- query matcher infos
subject: STRING
-- Current subject to match (the string last passed to match or
-- match_substring).
subject_start: INTEGER
-- First position of the portion of interest of subject.
-- match sets it to 1, match_substring sets it to a_from.
subject_end: INTEGER
-- Last position of the portion of interest of subject.
-- match sets it to a_subject.count, match_substring sets it to a_to.
is_matcher_active: BOOLEAN
-- True after match or match_substring, False after make or reset
-- (see their postconditions). An active matcher implies is_compiled
-- (class invariant).
has_matched: BOOLEAN
-- True exactly when match_count > 0 (see postcondition).
ensure
Result = (match_count > 0) --!! matched: Result implies match_count > 0
match_count: INTEGER
-- Number of matched patterns.
-- Result > 1 implies there are matched (sub-) portions.
matched_portion (a_n: INTEGER): STRING
-- Returns the n-th matched portion of the last matched subject.
-- a_n = 0 represents the whole matched portion.
-- Requires an active matcher and 0 <= a_n < match_count.
require
active_matcher: is_matcher_active;
valid_index: a_n >= 0 and then a_n < match_count
matched_portion_in (a_result: STRING; a_n: INTEGER)
-- Appends the n-th matched portion of the last matched subject to a_result.
-- a_n = 0 represents the whole matched portion.
-- Requires an active matcher and 0 <= a_n < match_count.
-- NOTE(review): a_result /= Void is not listed as a precondition here,
-- unlike valid_result on replacement_in / replace_in; confirm intent.
require
active_matcher: is_matcher_active;
valid_index: a_n >= 0 and then a_n < match_count
start_of_portion (a_n: INTEGER): INTEGER
-- Start position of the n-th subexpression.
-- a_n = 0 represents the whole matched portion of the subject.
-- Result = 0 implies an undefined subexpression.
-- Any non-zero Result is >= subject_start; Result is always <= subject_end.
require
active_matcher: is_matcher_active;
valid_index: a_n >= 0 and then a_n < match_count
ensure
Result /= 0 implies Result >= subject_start;
Result <= subject_end
end_of_portion (a_n: INTEGER): INTEGER
-- End position of the n-th subexpression.
-- a_n = 0 represents the whole matched expression.
-- Result = -1 implies an undefined subexpression.
-- Result always lies in -1 .. subject_end.
require
active_matcher: is_matcher_active;
valid_index: a_n >= 0 and then a_n < match_count
ensure
Result >= - 1 and then Result <= subject_end
feature(s) from REGULAR_EXPRESSION
-- Element Change
reset
-- A reset is only needed if options must be changed; an option change
-- requires a recompilation.
-- On return no pattern is compiled, nothing has matched and no matcher
-- is active (postcondition all_reset), i.e. the same state as after make.
ensure
all_reset: not is_compiled and not has_matched and not is_matcher_active
feature(s) from REGULAR_EXPRESSION
-- compiler commands
compile (a_pattern: STRING)
-- Compile a_pattern.
-- NOTE(review): no precondition or postcondition is listed here; the
-- outcome is presumably reported through is_compiled, compile_error
-- and error_offset; confirm. A Void a_pattern is not excluded by contract.
--
-- There are two different sets of meta-characters: those that
-- are recognized anywhere in the pattern except within square
-- brackets, and those that are recognized in square brackets.
-- Outside square brackets, the meta-characters are as follows:
--   \      general escape character with several uses
--   ^      assert start of subject (or line, in multiline mode)
--   $      assert end of subject (or line, in multiline mode)
--   .      match any character except newline (by default)
--   [      start character class definition
--   |      start of alternative branch
--   (      start subpattern
--   )      end subpattern
--   ?      extends the meaning of (
--          also 0 or 1 quantifier
--          also quantifier minimizer
--   *      0 or more quantifier
--   +      1 or more quantifier
--   {      start min/max quantifier
--
-- Part of a pattern that is in square brackets is called a
-- "character class". In a character class the only
-- meta-characters are:
--   \      general escape character
--   ^      negate the class, but only if the first character
--   -      indicates character range
--   ]      terminates the character class
feature(s) from REGULAR_EXPRESSION
-- matcher commands
match (a_subject: STRING)
-- Sets the value of match_count and the infos of the first
-- matched portion (if any).
-- Requires a compiled pattern and a non-Void a_subject.
-- Afterwards the matcher is active on the whole of a_subject
-- (subject_start = 1, subject_end = a_subject.count).
require
compiled: is_compiled;
valid_subject: a_subject /= Void
ensure
subject = a_subject;
subject_start = 1;
subject_end = a_subject.count;
is_matcher_active
match_substring (a_subject: STRING; a_from, a_to: INTEGER)
-- Sets the value of match_count and the infos of the first
-- matched portion (if any).
-- Requires a compiled pattern and a non-Void a_subject.
-- Afterwards the matcher is active on a_subject with
-- subject_start = a_from and subject_end = a_to.
-- NOTE(review): no precondition constrains a_from / a_to to valid
-- positions of a_subject; confirm how out-of-range values are handled.
require
compiled: is_compiled;
valid_subject: a_subject /= Void
ensure
subject = a_subject;
subject_start = a_from;
subject_end = a_to;
is_matcher_active
first_match
-- Rewind the matcher to the first match (if any).
-- Requires an active matcher.
require
active_matcher: is_matcher_active
next_match
-- Match the next portion (if any).
-- Requires an active matcher.
require
active_matcher: is_matcher_active
feature(s) from REGULAR_EXPRESSION
-- replace commands
-- All of them require an active matcher and non-Void arguments.
replacement (a_replacement: STRING): STRING
-- Returns the replacement with evaluated references.
-- This only makes sense if you use references to the matched subject
-- portions of the form \n\, where n is the number of a subexpression
-- of the compiled pattern.
-- If no references are used, this feature returns a copy of a_replacement.
require
active_matcher: is_matcher_active;
valid_replacement: a_replacement /= Void
replacement_in (a_result, a_replacement: STRING)
-- Appends the replacement with evaluated references to a_result.
-- This only makes sense if you use references to the matched subject
-- portions of the form \n\, where n is the number of a subexpression
-- of the compiled pattern.
-- If no references are used, this feature puts a copy of a_replacement
-- in a_result.
require
active_matcher: is_matcher_active;
valid_result: a_result /= Void;
valid_replacement: a_replacement /= Void
replace (a_replacement: STRING): STRING
-- Returns a new string with the result of the replace action. Only the
-- currently matched portion is replaced by a_replacement.
require
active_matcher: is_matcher_active;
valid_replacement: a_replacement /= Void
replace_in (a_result, a_replacement: STRING)
-- Appends the result of the replace action to a_result. Only the
-- currently matched portion (if any) is replaced by a_replacement;
-- otherwise a copy of the subject is appended to a_result.
require
active_matcher: is_matcher_active;
valid_result: a_result /= Void;
valid_replacement: a_replacement /= Void
replace_all (a_replacement: STRING): STRING
-- Returns a new string with the result of the replace action. The
-- replacement takes place over the whole subject string.
-- On return has_matched is False (see postcondition).
require
active_matcher: is_matcher_active;
valid_replacement: a_replacement /= Void
ensure
not has_matched
replace_all_in (a_result, a_replacement: STRING)
-- Appends the result of the replace action to a_result. The
-- replacement takes place over the whole subject string.
-- On return has_matched is False (see postcondition).
require
active_matcher: is_matcher_active;
valid_result: a_result /= Void;
valid_replacement: a_replacement /= Void
ensure
not has_matched
feature(s) from REGULAR_EXPRESSION
-- split commands
split: ARRAY[STRING]
-- Splits the string subject into the portions not matched by the
-- compiled pattern and returns a new array containing all pieces.
-- Requires an active matcher; on return has_matched is False.
require
active_matcher: is_matcher_active
ensure
not has_matched
split_in (a_collection: COLLECTION[STRING])
-- Splits the string subject into the portions not matched by the
-- compiled pattern and appends all pieces to a_collection.
-- Requires an active matcher and a non-Void a_collection.
-- On return has_matched is False and a_collection has not shrunk.
require
active_matcher: is_matcher_active;
valid_collection: a_collection /= Void
ensure
not has_matched;
a_collection.count >= old a_collection.count
invariant
-- first_character is either within 0..255 or exactly -1.
valid_first_character: not first_character.in_range(0,255) implies first_character = - 1;
-- required_character, when outside 0..255, must be at least -2.
-- NOTE(review): as written this also admits values above 255.
-- first_character and required_character are not listed in this
-- interface, so the intended sentinel values (presumably -1 and -2)
-- should be confirmed against the implementation.
valid_required_character: not required_character.in_range(0,255) implies required_character >= - 2;
-- A matcher can only be active on a compiled pattern.
is_matcher_active implies is_compiled;
end of REGULAR_EXPRESSION