import pandas as pd
import numpy as np
import string
import csv
import sympy
import esr.generation.generator as generator
def split_by_punctuation(s):
    """
    Split a string s into a list, where each instance of punctuation or a space causes a split.
    Underscores are not treated as punctuation, so they may appear inside variable names.
    E.g. the string s = 'Hello, how are you?' becomes ['Hello', ',', ' ', 'how', ' ', 'are', ' ', 'you', '?']

    Args:
        :s (str): String we wish to split

    Returns:
        :split_str (list): List of strings split by punctuation. Every separator is kept as its
            own single-character entry, no entry is empty, and ''.join(split_str) == s.
            A string without any separator is returned as a one-element list, and the
            empty string gives an empty list.
    """
    separators = set(string.punctuation.replace('_', ''))  # allow underscores in variable names
    separators.add(' ')

    split_str = []
    start = 0  # index at which the current run of non-separator characters begins
    for i, char in enumerate(s):
        if char not in separators:
            continue
        # Only emit the text before this separator if there is any, so that
        # adjacent separators (e.g. ', ' or '**') do not produce empty entries
        if i > start:
            split_str.append(s[start:i])
        split_str.append(char)
        start = i + 1

    # Text after the last separator (this is all of s if it contains no separator)
    if start < len(s):
        split_str.append(s[start:])
    return split_str
def standardise_file(in_name, out_name, input_delimiter):
    """
    Standardise the input equations used so that variables are named x0, x1, ..., x9

    Args:
        :in_name (str): Name of file containing the equations to study. If None, then the
            equations are read from the 'New Formula' column of out_name instead
        :out_name (str): Name of file to output the new equations to
        :input_delimiter (str): The delimiter used in the input csv file

    Returns:
        :all_eq (list): List of equations as strings with the standardised variable names
        :maxvar (int): The maximum number of variables appearing in any of the equations
    """
    if in_name is None:
        # NOTE(review): out_name is read with input_delimiter, whereas the branch below
        # writes it with the csv default (','). Confirm these agree for your files.
        df = pd.read_csv(out_name, delimiter=input_delimiter)
        all_eq = df['New Formula'].tolist()
        maxvar = 0
        for eq in all_eq:
            tokens = split_by_punctuation(eq)
            # A token x{i} implies that at least i+1 variables are in use
            n_implied = [int(tok[1:]) + 1 for tok in tokens
                         if tok.startswith('x') and tok[1:].isdigit()]
            # Use the largest implied count. Counting the tokens would count repeated
            # occurrences of the same variable several times.
            maxvar = max(maxvar, max(n_implied, default=0))
        return all_eq, maxvar

    df = pd.read_csv(in_name, delimiter=input_delimiter)
    maxvar = int(df['# variables'].max())
    all_eq = []

    # newline='' lets the csv module control line endings itself
    with open(out_name, 'w', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(['Filename', 'Number', 'Old Formula', 'New Formula'])

        for _, row in df.iterrows():
            if not np.isfinite(row['# variables']):
                continue

            eq = row['Formula'].replace(" ", "")

            # If equation already has variable 'x{i}' then we don't need to replace it
            # Note: At least Eqs I.18.12, I.18.14 and II.37.1 in AIFeynman has wrong number of vars
            # Reading every name column and keeping only the strings fixes this
            var_names = [row[f'v{i+1}_name'] for i in range(maxvar)]
            var_names = [v.replace(" ", "") for v in var_names if isinstance(v, str)]
            names = [f'x{i}' for i in range(len(var_names))]

            # Variables which need renaming, and the standard names which are still free
            to_change = sorted(set(var_names) - set(names), key=var_names.index)
            to_sub = sorted(set(names) - (set(var_names) - set(to_change)), key=names.index)
            sub_dict = dict(zip(to_change, to_sub))

            # Must split by punctuation to avoid replacing e.g. "t" in "sqrt" if we have a variable "t"
            split_eq = split_by_punctuation(eq)
            eq = ''.join(sub_dict.get(tok, tok) for tok in split_eq)

            all_eq.append(eq)
            csvwriter.writerow([row['Filename'], row['Number'], row['Formula'], eq])

    return all_eq, maxvar
class SymbolCoder:
    """Encode the tree structure of equations as n-tuples of operator codewords."""

    # sympy class names of the arithmetic operators and the symbols used for them here
    _SYMPY_BINARY_SYMBOLS = {'Add': '+', 'Sub': '-', 'Mul': '*', 'Div': '/'}

    def __init__(self, basis_functions):
        """Class to encode equations as tuples of strings to be used by a back-off model

        Args:
            :basis_functions (list): List of basis functions to consider. Entries 0, 1 and 2 are lists of nullary, unary, and binary operators, respectively.

        Returns:
            SymbolCoder: A coder to encode equations
        """
        self.basis_functions = basis_functions

        # Names which are treated as a numerical constant; stored in lower case
        # since all lookups are made with lower-cased names
        numeric_names = [
            'Number', 'Float', 'Rational', 'Integer', 'AlgebraicNumber',
            'NumberSymbol', 'RealNumber', 'igcd', 'ilcm', 'seterr', 'Zero',
            'One', 'NegativeOne', 'Half', 'NaN', 'Infinity', 'NegativeInfinity',
            'ComplexInfinity', 'Exp1', 'ImaginaryUnit', 'Pi', 'EulerGamma',
            'Catalan', 'GoldenRatio', 'TribonacciConstant', 'mod_inverse',
        ]
        self.sympy_numerics = [name.lower() for name in numeric_names]

        # All symbols which can be encoded; the codeword of a symbol is its
        # position in this list, stored as a string
        self.ops = [str(None), 'a', 'x', 'y'] + basis_functions[1] + basis_functions[2]
        codewords = np.arange(len(self.ops)).astype(str)
        self.code = dict(zip(self.ops, codewords))

        self.ignore_ops = ['Abs', 're', 'im']  # do not attempt to find probability of these operators

    def nodes2ntuples(self, n, nodes):
        """
        Convert a node object giving the tree representation of an equation into
        n-tuples describing the tree structure of the function

        Args:
            :n (int): The length of the n-tuples to produce
            :nodes (esr.generation.generator.DecoratedNode): Node object corresponding to the tree

        Returns:
            :ntuples (list): List of n-tuples which describe tree structure of function
        """
        lineages, values = nodes.get_sibling_lineage()

        ntuples = []
        for lineage, value in zip(lineages, values):

            # Drop ignored operators from the lineage (and the matching values)
            keep = [i for i, op in enumerate(lineage) if op not in self.ignore_ops]
            kept_ops = [lineage[i] for i in keep]
            kept_vals = [value[i] for i in keep]

            # Skip this lineage if its final entry involves an ignored operator
            last = kept_ops[-1]
            if last[0] in self.ignore_ops:
                continue
            if len(last) > 1 and last[1] in self.ignore_ops:
                continue

            # Get codeword of ancestors: the n-1 entries before the final one,
            # padded at the front with None if the lineage is too short
            if len(kept_ops) >= n:
                ancestors = kept_ops[-n:-1]
            else:
                padding = [None] * (n - len(kept_ops))
                ancestors = tuple(padding + list(kept_ops[:-1]))
            ancestor_codes = [self.op2codeword(op) for op in ancestors]

            # Deal with sibling at end of tree; a lone final entry is paired with None
            final = lineage[-1]
            pair = final if isinstance(final, tuple) else (final, None)
            sib = [self.op2str(op) for op in pair]

            # Two variable siblings which hold different values are labelled 'x' and 'y'
            if sib[0] == 'x' and sib[1] == 'x' and (kept_vals[-1][0] != kept_vals[-1][1]):
                sib[1] = 'y'

            codes = list(ancestor_codes + [self.code[s] for s in sib])

            # Remove "None" at start of lineage
            not_none = [i for i in range(len(codes)) if codes[i] != self.code['None']]
            ntuples.append(tuple(codes[not_none[0]:]))

        # The parent node will not have been considered
        root = tuple([ntuples[0][0]])
        return [root] + ntuples

    def labels2ntuples(self, n, labels):
        """
        Convert a list of labels giving the tree representation of an equation into
        n-tuples describing the tree structure of the function

        Args:
            :n (int): The length of the n-tuples to produce
            :labels (list): The list giving the equation to convert to an n-tuple

        Returns:
            :ntuples (list): List of n-tuples which describe tree structure of function
        """
        # Replace every numerical label by the generic constant 'a', leaving the input untouched
        labels_changed = labels.copy()
        for pos, label in enumerate(labels):
            is_numeric = label.lower() in self.sympy_numerics or generator.is_float(label)
            if is_numeric:
                labels_changed[pos] = 'a'

        # Get parent operators
        shape = generator.labels_to_shape(labels_changed, self.basis_functions)
        success, _, tree = generator.check_tree(shape)
        assert success

        for pos, label in enumerate(labels_changed):
            tree[pos].assign_op(label)

        nodes = generator.DecoratedNode(None, self.basis_functions)
        nodes.from_node_list(0, tree, self.basis_functions)

        return self.nodes2ntuples(n, nodes)

    def equation2ntuples(self, n, eq, locs):
        """
        Convert an equation into n-tuples describing the tree structure of the function

        Args:
            :n (int): The length of the n-tuples to produce
            :eq (str): The equation to convert to an n-tuple
            :locs (dict): dictionary of string:sympy objects describing variables

        Returns:
            :ntuples (list): List of n-tuples which describe tree structure of function
        """
        expr, nodes, c = generator.string_to_node(
            eq, self.basis_functions, locs=locs, evalf=True, allow_eval=False, check_ops=True)

        # Remove any operators we want to ignore ('Abs'): if the name of one appears
        # anywhere in the input, strip those names from the parsed expression and parse again
        if any(op in eq for op in self.ignore_ops):
            tokens = split_by_punctuation(str(expr))
            stripped = ''.join(tok for tok in tokens if tok not in self.ignore_ops)
            expr, nodes, c = generator.string_to_node(
                stripped, self.basis_functions, locs=locs, evalf=True, allow_eval=False, check_ops=True)

        # Translate the labels (in place) into the symbols used by this class
        labels = nodes.to_list(self.basis_functions)
        for pos, label in enumerate(labels):
            symbol = self._SYMPY_BINARY_SYMBOLS.get(label)
            if symbol is not None and symbol in self.basis_functions[2]:
                labels[pos] = symbol
            elif label.lower() in self.sympy_numerics or generator.is_float(label):
                labels[pos] = 'a'
            else:
                labels[pos] = label.lower()

        return self.labels2ntuples(n, labels)

    def process_all_equations(self, n, all_eq, maxvar):
        """
        Turn all equations into n-tuples describing the tree structures of their functions

        Args:
            :n (int): The length of the n-tuples to produce
            :all_eq (list): List of equations to convert to n-tuples. Entries which are strings are treated as equations, any other entry as a list of labels.
            :maxvar (int): The maximum number of variables appearing in any of the equations

        Returns:
            :ntuples (list): List of n-tuples which describe tree structures of the functions
        """
        # Real symbols x0, x1, ... and a0, a1, ... which equation strings may refer to
        x = sympy.symbols([f'x{i}' for i in range(maxvar)], real=True)
        a = sympy.symbols([f'a{i}' for i in range(maxvar)], real=True)
        locs = {f'x{i}': sym for i, sym in enumerate(x)}
        locs.update({f'a{i}': sym for i, sym in enumerate(a)})

        ntuples = []
        for eq in all_eq:
            if isinstance(eq, str):
                ntuples.extend(self.equation2ntuples(n, eq, locs))
            else:
                ntuples.extend(self.labels2ntuples(n, eq))
        return ntuples

    def op2str(self, op):
        """
        Convert operator names defined by sympy into symbols used here

        Args:
            :op (str): Operator name of sympy class

        Returns:
            str: The equivalent symbol used here

        Raises:
            Exception: If the operator is not recognised
        """
        if op is None:
            return str(None)

        # Numerical constants
        if op.lower() in self.sympy_numerics or generator.is_float(op):
            return 'a'

        # Variables: either the sympy class name or a name of the form x, a, x{i} or a{i}
        if op == 'Symbol':
            return 'x'
        if op.startswith(('x', 'a')) and (op[1:].isdigit() or len(op) == 1):
            return 'x'

        # Arithmetic operators, provided they are among the binary basis functions
        symbol = self._SYMPY_BINARY_SYMBOLS.get(op)
        if symbol is not None and symbol in self.basis_functions[2]:
            return symbol

        # Any other unary or binary basis function is referred to by its lower-case name
        lowered = op.lower()
        if (lowered in self.basis_functions[1]) or (lowered in self.basis_functions[2]):
            return lowered

        raise Exception("Unknown operator type:" + op)

    def op2codeword(self, op):
        """
        Convert an operator name as defined by sympy into the codeword assigned to it

        Args:
            :op (str): Operator name of sympy class

        Returns:
            str: The codeword used to represent this symbol
        """
        symbol = self.op2str(op)
        return self.code[symbol]