local f = require('DataGenerator')
-- Self-test driver: for every pattern in `data`, generate strings with `f`
-- and print one TAP-style "ok" line for each generated string that the
-- pattern matches. Strings that do not match are skipped silently.
function test_data (data)
  local passed = 0
  for _, patt in ipairs(data) do
    local generated = f(patt)
    for _, candidate in ipairs(generated) do
      local hit = string.match(candidate, patt)
      if hit then
        passed = passed + 1
        local line = 'ok ' .. passed
        line = line .. ' - "' .. candidate
        line = line .. '" match /' .. patt .. '/'
        print(line)
      end
    end
  end
end
-- Patterns exercised by the self-test, handed straight to test_data.
local data = {}
data[#data + 1] = 'bdede$xx$'
test_data(data)
-- Maximum number of records kept in a single generated array.
local array_limit = 100
-- Upper repetition bound used for `+`, `*` and `-`; also the amount added
-- to n for an open-ended `{n,}` quantifier.
local max_rep_times = 10
-- When an input array is already longer than the record limit, the limit
-- becomes that length plus this step.
local step_length = 1
-- Lua 5.1 has a global `unpack`; 5.2+ moved it to `table.unpack`.
local unpack = table.unpack or unpack
-- File-local replacement for the built-in `error`: prints its arguments
-- and terminates the process instead of raising a Lua error.
-- The vararg marker must be the three ASCII dots `...`.
local function error (...)
  print(...)
  os.exit()
end
-- Parse a quantifier written as `{n}`, `{n,}` or `{n,m}` into a
-- two-element array { from, to }:
--   {n}   -> { n, n }
--   {n,}  -> { n, n + max_rep_times }
--   {n,m} -> { n, m }
-- Spaces inside the braces are ignored. Any other unexpected character is
-- reported through error(). If the string ends before the closing brace,
-- whatever was collected so far is returned.
--
-- States: 0 = waiting for '{', 1 = waiting for first digit of n,
--         2 = reading n, 3 = just saw ',', 4 = reading m.
local function parse_quantifier_str (str)
  local array = {}
  local mode = 0
  local num_str = ''
  for char in string.gmatch(str, '.') do
    local is_digit = string.match(char, '%d') ~= nil
    if mode == 0 then
      -- Nothing is parsed until the opening brace is seen.
      if char == '{' then
        mode = 1
      end
    elseif mode == 1 then
      if is_digit then
        num_str = char
        mode = 2
      -- `char ~= ' '` (not `not char == ' '`, which is always false).
      elseif char ~= ' ' then
        error('error:01:class difine with no digit numer')
      end
    elseif mode == 2 then
      if is_digit then
        num_str = num_str .. char
      elseif char == ',' then
        array[#array + 1] = tonumber(num_str)
        num_str = ''
        mode = 3
      elseif char == '}' then
        -- {5} => { 5, 5 }
        local n = tonumber(num_str)
        array[#array + 1] = n
        array[#array + 1] = n
        num_str = ''
        break
      elseif char ~= ' ' then
        error('error:02:quantifier with non digit char')
      end
    elseif mode == 3 then
      if char == '}' then
        -- Previous significant char was ',', so this is the `{n,}` form.
        array[#array + 1] = array[1] + max_rep_times
        break
      elseif is_digit then
        mode = 4
        num_str = char
      elseif char ~= ' ' then
        error('error:03: wrong char in class str')
      end
    elseif mode == 4 then
      if is_digit then
        num_str = num_str .. char
      elseif char == '}' then
        array[#array + 1] = tonumber(num_str)
        break
      elseif char ~= ' ' then
        error('error:04: wrong char in class str')
      end
    end
  end
  return array
end
-- Return every concatenation `v1 .. v2` with v1 from a1 and v2 from a2,
-- in a1-major order.
--
-- `limit` (optional) caps the output and defaults to array_limit. If
-- either input is already longer than the limit, the limit becomes that
-- length plus step_length. The cap is checked only after a full pass over
-- a2, so the result may exceed the limit by up to one row of a2.
local function concat_array (a1, a2, limit)
  limit = limit or array_limit
  local max_length = math.max(1, #a1, #a2)
  if max_length > limit then
    limit = max_length + step_length
  end
  -- Must be local: a bare `count` here would create/overwrite a global.
  local count = 0
  local a = {}
  for _, v1 in ipairs(a1) do
    for _, v2 in ipairs(a2) do
      a[#a + 1] = v1 .. v2
      count = count + 1
    end
    if count > limit then break end
  end
  return a
end
-- Combine a list of string arrays into one array of concatenations.
-- The list is folded from the right: the last two arrays are combined
-- with concat_array, then the result is combined with the array before
-- them, and so on. A single-element list returns that element unchanged.
-- An empty list returns { '' }, matching what concat_rep_array returns
-- for zero repetitions. The input list is not modified.
local function concat_multi_array (array)
  local n = #array
  if n == 0 then
    return { '' }
  end
  local combined = array[n]
  for i = n - 1, 1, -1 do
    combined = concat_array(array[i], combined)
  end
  return combined
end
-- Produce the strings obtained by repeating `array` `rep_times` times.
-- Zero repetitions give { '' }; one repetition returns `array` itself;
-- otherwise `array` is listed rep_times times and handed to
-- concat_multi_array.
local function concat_rep_array (array, rep_times)
  if rep_times == 0 then
    return { '' }
  elseif rep_times == 1 then
    return array
  end
  local copies = {}
  for slot = 1, rep_times do
    copies[#copies + 1] = array
  end
  return concat_multi_array(copies)
end
-- Class-and-quantifier list to string arrays.
-- `a` is a list of { class, { from, to } } pairs. For each pair, the
-- results of concat_rep_array(class, i) for every i in from..to are
-- collected into one array. Returns the list of those arrays, in order.
local function cnq_to_array (a)
  local combin_array = {}
  for _, value in ipairs(a) do
    local class, quant = value[1], value[2]
    local from, to = quant[1], quant[2]
    local array = {}
    for i = from, to do
      -- Append this repetition count's strings to those already collected.
      for _, v in ipairs(concat_rep_array(class, i)) do
        array[#array + 1] = v
      end
    end
    combin_array[#combin_array + 1] = array
  end
  return combin_array
end
-- Byte value of the most recently issued placeholder character. It is
-- incremented before use, so the first placeholder is byte 129.
local count = 128
-- Maps a placeholder character to its id string, or (mode 1) to a table
-- { id = id } whose array part is filled by process_patt_str.
local char_id = {}
-- Allocate the next placeholder character for `id` and record it in
-- char_id. With mode == 1 the record is a table { id = id } instead of
-- the plain id string. Returns the placeholder character.
local function apply_id_char (id, mode)
  count = count + 1
  local char = string.char(count)
  if mode == 1 then
    char_id[char] = { id = id }
  else
    char_id[char] = id
  end
  return char
end
-- Return the id that a placeholder character was registered with.
-- Handles both record shapes stored in char_id: a plain string, or a
-- table carrying the id in its `id` field. An unknown character is
-- reported through error().
local function get_char_id (char)
  local value = char_id[char]
  if type(value) == 'table' then
    return value.id
  elseif type(value) == 'string' then
    return value
  end
  error('error:05:not exists id records: ' .. char)
end
-- Expand a class character into the list of single characters it matches.
-- `char` is either '.' or a placeholder whose id is a class letter; the
-- Lua pattern is '.' or '%' .. id respectively. Every byte 0..126 is
-- tested against that pattern, and the matches are returned in ascending
-- byte order.
local function get_char_class (char)
  -- The pattern does not depend on the candidate byte, so build it once.
  local patt
  if char == '.' then
    patt = char
  else
    patt = '%' .. get_char_id(char)
  end
  local array = {}
  for i = 0, 126 do
    local i_char = string.char(i)
    if string.find(i_char, patt) then
      array[#array + 1] = i_char
    end
  end
  return array
end
-- Return the `pos`-th source fragment recorded for the placeholder
-- character `char`, i.e. char_id[char][pos]. A character with no record
-- in char_id is reported through error().
local function get_char_str (char, pos)
  local str_list = char_id[char]
  if str_list then
    return str_list[pos]
  else
    error('error:06:not exists char record')
  end
end
-- Built-in quantifier characters mapped to their { from, to } repetition
-- range. The open-ended ones are capped at max_rep_times.
local quantifier_char_table = {
  ['+'] = { 1, max_rep_times },
  ['*'] = { 0, max_rep_times },
  ['-'] = { 0, max_rep_times },
  ['?'] = { 0, 1 },
}
-- Maps the character that follows `%` in a pattern to the placeholder
-- character that replaces the whole escape.
local escape_id_table = {}
-- Letters that name a character class when written as `%x`.
local class_str = 'a c d l p s u w x z A C D L P S U W X Z'
-- Characters that stand for a whole class: '.' itself plus one
-- placeholder per class letter (placeholder -> letter).
local class_char_table = { ['.'] = '.' }
for id in string.gmatch(class_str, '%S') do
  local id_char = apply_id_char(id)
  class_char_table[id_char] = id
  escape_id_table[id] = id_char
end
-- Magic characters whose escaped form is hidden behind a placeholder, so
-- that later passes only see the unescaped (magic) occurrences.
local conceal_str = '+ - * ? { } % .'
-- placeholder -> the literal character it hides
local conceal_char_table = {}
for id in string.gmatch(conceal_str, '%S') do
  local id_char = apply_id_char(id)
  conceal_char_table[id_char] = id
  escape_id_table[id] = id_char
end
-- Placeholder substituted for every user-defined set `[...]` in a
-- pattern; its char_id record collects the original set strings.
local user_class_char = apply_id_char('[…]', 1)
-- Placeholder substituted for every user-defined quantifier `{n}`,
-- `{n,}` or `{n,m}`; its char_id record collects the original strings.
local user_quantifier_char = apply_id_char('{…}', 1)
-- Placeholder substituted for a range such as `0-8` or `a-z` inside a set.
local user_range_char = apply_id_char('[n-m]', 1)
-- Normalise a Lua pattern into the placeholder form consumed by
-- patt_to_array:
--   1. drop a leading `^` anchor and an unescaped trailing `$` anchor;
--   2. replace each `%x` escape with its placeholder from
--      escape_id_table, or with the bare character x if there is none;
--   3. replace each `[...]` set with user_class_char and each `{n}`,
--      `{n,}`, `{n,m}` quantifier with user_quantifier_char, recording
--      the original text in that placeholder's char_id list.
-- Returns the rewritten pattern.
function process_patt_str (patt)
  -- patt_to_array reads the recorded fragments by position starting at 1,
  -- so fragments left over from a previous call must be discarded.
  for _, placeholder in ipairs({ user_class_char, user_quantifier_char }) do
    local list = char_id[placeholder]
    for i = #list, 1, -1 do
      list[i] = nil
    end
  end

  -- Only a `^` at the very start is an anchor; others (set negation,
  -- escaped literals) must be kept.
  patt = string.gsub(patt, '^%^', '')
  -- A trailing `$` is an anchor only when preceded by an even number of
  -- `%` characters; otherwise it is an escaped literal dollar sign.
  local body, escapes = string.match(patt, '^(.-)(%%*)%$$')
  if body and #escapes % 2 == 0 then
    patt = body .. escapes
  end

  -- Hide escaped characters behind placeholders.
  patt = string.gsub(patt, '%%(.)', function (id)
    return escape_id_table[id] or id
  end)

  -- User-defined sets are not expanded here; they are stored and replaced.
  patt = string.gsub(patt, '%[.-%]', function (str)
    table.insert(char_id[user_class_char], str)
    return user_class_char
  end)

  -- User-defined quantifiers are likewise stored and replaced.
  patt = string.gsub(patt, '{%d+,?%d-}', function (str)
    table.insert(char_id[user_quantifier_char], str)
    return user_quantifier_char
  end)

  return patt
end
-- Parse a set string `[...]` into an array of the single characters it
-- contains.
--
-- Ranges written as `x-y` are expanded first and replaced by
-- user_range_char. Members are then read one by one: a class placeholder
-- contributes every character of its class, a range placeholder
-- contributes its expanded range, a concealed placeholder contributes the
-- literal it hides, and any other character (including a plain `.`, which
-- is literal inside a set) contributes itself. A `^` directly after `[`
-- negates the set: the result is then every byte 0..126 that is NOT a
-- member. A non-negated result is deduplicated and sorted ascending.
-- A `]` directly after `[` is reported through error().
local function parse_class_str (str)
  -- Expand each `from-to` range up front; ranges are consumed in order.
  local range_char_table = {}
  str = string.gsub(str, '(.)%-(.)', function (from, to)
    local class = {}
    for i = string.byte(from), string.byte(to) do
      class[#class + 1] = string.char(i)
    end
    range_char_table[#range_char_table + 1] = class
    return user_range_char
  end)

  -- Members are kept as hash keys so duplicates collapse.
  local char_table = {}
  local range_pos = 1

  -- Record everything the single set element `char` stands for.
  local function add_member (char)
    if char ~= '.' and class_char_table[char] then
      for _, k in ipairs(get_char_class(char)) do
        char_table[k] = 1
      end
    elseif char == user_range_char then
      local class = range_char_table[range_pos]
      range_pos = range_pos + 1
      for _, k in ipairs(class) do
        char_table[k] = 1
      end
    elseif conceal_char_table[char] then
      char_table[get_char_id(char)] = 1
    else
      char_table[char] = 1
    end
  end

  local reverse_mode = false
  local opened = false   -- the opening '[' has been seen
  local at_first = true  -- the next character is the first after '['
  for char in string.gmatch(str, '.') do
    if not opened then
      if char == '[' then
        opened = true
      end
    elseif at_first and char == '^' then
      at_first = false
      reverse_mode = true
    elseif char == ']' then
      if at_first then
        error('error:07:without class define')
      end
      break
    else
      at_first = false
      add_member(char)
    end
  end

  local array = {}
  if reverse_mode then
    for i = 0, 126 do
      local char = string.char(i)
      if not char_table[char] then
        array[#array + 1] = char
      end
    end
  else
    for k in pairs(char_table) do
      array[#array + 1] = k
    end
    -- pairs() order is unspecified, so sort for a stable result.
    table.sort(array)
  end
  return array
end
-- Turn a pre-processed pattern (see process_patt_str) into a list of
-- { class, quantifier } pairs, where `class` is an array of candidate
-- characters and `quantifier` is { from, to }.
--
-- The pattern is scanned one character at a time. Each element may be
-- followed by a quantifier character or a user quantifier placeholder;
-- without one it gets { 1, 1 }. A quantifier with no element before it is
-- reported through error(). A trailing `-` is treated as `?`.
local function patt_to_array (patt)
  local array = {}
  -- Next unread fragment in the user-set and user-quantifier lists.
  local class_pos = 1
  local quantifier_pos = 1

  -- Resolve one pattern character to the array of characters it can emit.
  local function class_of (char)
    if class_char_table[char] then
      return get_char_class(char)
    elseif char == user_class_char then
      local str = get_char_str(char, class_pos)
      class_pos = class_pos + 1
      return parse_class_str(str)
    elseif conceal_char_table[char] then
      return { get_char_id(char) }
    end
    return { char }
  end

  patt = string.gsub(patt, '-$', '?')
  -- Sentinel: an element is only emitted when the following character is
  -- seen, so one extra character flushes the last element. The sentinel
  -- itself is never emitted.
  patt = patt .. '$'

  local class = {}
  local pending = false  -- true while `class` waits for its quantifier
  for char in string.gmatch(patt, '.') do
    if not pending then
      if quantifier_char_table[char] then
        error('error:08:quantifier char could not at begin')
      elseif char == user_quantifier_char then
        error('error:09:user quantifier could not at begin')
      else
        class = class_of(char)
        pending = true
      end
    else
      local quantifier = { 1, 1 }
      local consumed = false
      if quantifier_char_table[char] then
        quantifier = quantifier_char_table[char]
        consumed = true
      elseif char == user_quantifier_char then
        local str = get_char_str(char, quantifier_pos)
        quantifier = parse_quantifier_str(str)
        quantifier_pos = quantifier_pos + 1
        consumed = true
      end
      array[#array + 1] = { class, quantifier }
      if consumed then
        -- The quantifier closed the element; start fresh on the next char.
        pending = false
      else
        -- This character is itself the next element.
        class = class_of(char)
      end
    end
  end
  return array
end
-- Entry point: generate strings from the Lua pattern `str`.
-- The pattern goes through process_patt_str, patt_to_array and
-- cnq_to_array in turn, and the result of concat_multi_array is returned.
-- `mode` is accepted and defaulted to 0 but is not used afterwards.
function DataGenerator (str, mode)
  if not mode then
    mode = 0
  end
  local pairs_list = patt_to_array(process_patt_str(str))
  return concat_multi_array(cnq_to_array(pairs_list))
end
-- The chunk's return value is the generator function itself.
return DataGenerator