This repository has been archived by the owner on Jul 6, 2021. It is now read-only.

Updating transCSSR_bc to do faster estimation of joint word counts by marginalizing.

David Darmon authored and David Darmon committed Apr 30, 2018
1 parent 2dc00af commit fc78668
Showing 4 changed files with 399 additions and 12,664 deletions.
16 changes: 11 additions & 5 deletions demo_transCSSR_bc.py
@@ -11,6 +11,8 @@

from transCSSR_bc import *

+import time

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#
# The various test transducers. Xt is the input
@@ -111,19 +113,23 @@

alpha = 0.001

+counting_method = 1

verbose = False

# L is the maximum amount we want to ever look back.

-L_max = 5
+L_max = 10

Tx = len(stringX); Ty = len(stringY)

assert Tx == Ty, 'The two time series must have the same length.'

T = Tx

-word_lookup_marg, word_lookup_fut = estimate_predictive_distributions(stringX, stringY, L_max)
+startTime = time.time()
+word_lookup_marg, word_lookup_fut = estimate_predictive_distributions(stringX, stringY, L_max, counting_method = counting_method, axs = axs, ays = ays)
+print ('The transCSSR counting took {0} seconds...'.format(time.time() - startTime))

epsilon, invepsilon, morph_by_state = run_transCSSR(word_lookup_marg, word_lookup_fut, L_max, axs, ays, e_symbols, Xt_name, Yt_name, alpha = alpha, verbose = False)

@@ -133,7 +139,7 @@

filtered_states, filtered_probs, stringY_pred = filter_and_predict(stringX, stringY, epsilon, invepsilon, morph_by_state, axs, ays, e_symbols, L_max)

-print 'Xt Yt \hat\{Y\}t St P(Yt = 1 | Xt, St)'
+# print 'Xt Yt \hat\{Y\}t St P(Yt = 1 | Xt, St)'

-for t_ind in range(int(numpy.min([100, len(stringX)]))):
-    print stringX[t_ind], stringY[t_ind], stringY_pred[t_ind], filtered_states[t_ind], filtered_probs[t_ind]
+# for t_ind in range(int(numpy.min([100, len(stringX)]))):
+#     print stringX[t_ind], stringY[t_ind], stringY_pred[t_ind], filtered_states[t_ind], filtered_probs[t_ind]
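
For readers trying out this API: the following minimal sketch shows what the two counters returned by estimate_predictive_distributions hold, mirroring the counting_method == 0 logic in the transCSSR_bc.py diff below. The toy strings and L_max are made-up values, and the sketch uses Python 3 print syntax, while the repository itself targets Python 2.

from collections import Counter

# Made-up toy input/output pair over binary alphabets.
stringX = '0110100111'
stringY = '1011010001'
L_max = 2

# Counts of (x-word of length n, y-word of length n - 1): the joint history
# plus the current input, i.e. everything but the final output symbol.
word_lookup_marg = Counter()

# Counts of (x-word, y-word) of equal length n, including the final output.
word_lookup_fut = Counter()

T = len(stringX)

# Direct counting (counting_method == 0): each window of length L_max + 1
# contributes one count for itself and one for each of its prefixes.
for t_ind in range(T - L_max):
    cur_stringX = stringX[t_ind:t_ind + L_max + 1]
    cur_stringY = stringY[t_ind:t_ind + L_max + 1]

    for L in range(1, L_max + 2):
        word_lookup_marg[(cur_stringX[:L], cur_stringY[:L - 1])] += 1
        word_lookup_fut[(cur_stringX[:L], cur_stringY[:L])] += 1

# How often the input word '01' is seen jointly with the output word '10',
# and the same event marginalized over the final output symbol.
print(word_lookup_fut[('01', '10')])
print(word_lookup_marg[('01', '1')])
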
171 changes: 121 additions & 50 deletions transCSSR_bc.py
@@ -1077,7 +1077,7 @@ def get_transitions(epsilon, invepsilon, e_symbols, L_max, memoryless = False):



-def estimate_predictive_distributions(stringX, stringY, L_max, is_multiline = False, verbose = True):
+def estimate_predictive_distributions(stringX, stringY, L_max, counting_method = 0, axs = None, ays = None, is_multiline = False, verbose = False):
    """
    Given a string of inputs and outputs,
    returns the counts associated with
@@ -1105,6 +1105,25 @@ def estimate_predictive_distributions(stringX, stringY, L_max, is_multiline = Fa
    L_max : int
        The maximum history length to use in inferring the
        predictive distributions.
+    counting_method : int
+        The method used to count the occurrences of joint
+        words of length 0 to L_max + 1. One of {0, 1}
+            0 : The joint words of length 0 to L_max + 1 are
+                counted as we parse the string.
+            1 : The joint words of length L_max + 1 are counted
+                as we parse the string, and then the counts of
+                joint sub-words are obtained by marginalizing
+                the counts of the joint words.
+        0 is faster for larger L_max and shorter strings.
+        1 is faster for smaller L_max and longer strings.
+    axs : list
+        The emission symbols associated with X.
+        Only needed if counting_method == 1.
+    ays : list
+        The emission symbols associated with Y.
+        Only needed if counting_method == 1.
    is_multiline : bool
        True if the input files are stored with a single
        realization per line.
@@ -1136,81 +1155,133 @@
"""

-    if is_multiline:
-        Xs = copy.copy(stringX); Ys = copy.copy(stringY)
-
-        # Counter for events (X_{t-L}^{t-1}, Y_{t-L}^{t-1})
-
-        word_lookup_marg = Counter()
-
-        # Counter for events (X_{t-L}^{t-1}, Y_{t-L}^{t-1}, Y_{t})
-
-        word_lookup_fut = Counter()
+    # Counter for events (X_{t-L}^{t-1}, Y_{t-L}^{t-1})
+
+    word_lookup_marg = Counter()
+
+    # Counter for events (X_{t-L}^{t-1}, Y_{t-L}^{t-1}, Y_{t})
+
+    word_lookup_fut = Counter()
+
+    if is_multiline:
+        Xs = copy.copy(stringX); Ys = copy.copy(stringY)

        if verbose:
            print 'Estimating predictive distributions using multi-line.'

-        for line_ind in range(len(Xs)):
-            stringX = Xs[line_ind]; stringY = Ys[line_ind]
-
-            Tx = len(stringX)
-
-            Ty = len(stringY)
-
-            assert Tx == Ty, 'The two time series must have the same length.'
-
-            T = Tx
-
-            for t_ind in range(T-L_max):
-                cur_stringX = stringX[t_ind:(t_ind + L_max + 1)]
-
-                cur_stringY = stringY[t_ind:(t_ind + L_max + 1)]
-
-                word_lookup_marg[(cur_stringX, cur_stringY[:-1])] += 1
-                word_lookup_fut[(cur_stringX, cur_stringY)] += 1
-
-                for remove_inds in range(1, L_max+1):
-                    trunc_stringX = cur_stringX[:-remove_inds]
-                    trunc_stringY = cur_stringY[:-remove_inds]
-
-                    word_lookup_marg[(trunc_stringX, trunc_stringY[:-1])] += 1
-                    word_lookup_fut[(trunc_stringX, trunc_stringY)] += 1
+        if counting_method == 0:
+            for line_ind in range(len(Xs)):
+                stringX = Xs[line_ind]; stringY = Ys[line_ind]
+
+                Tx = len(stringX)
+
+                Ty = len(stringY)
+
+                assert Tx == Ty, 'The two time series must have the same length.'
+
+                T = Tx
+
+                for t_ind in range(T-L_max):
+                    cur_stringX = stringX[t_ind:(t_ind + L_max + 1)]
+
+                    cur_stringY = stringY[t_ind:(t_ind + L_max + 1)]
+
+                    word_lookup_marg[(cur_stringX, cur_stringY[:-1])] += 1
+                    word_lookup_fut[(cur_stringX, cur_stringY)] += 1
+
+                    for remove_inds in range(1, L_max+1):
+                        trunc_stringX = cur_stringX[:-remove_inds]
+                        trunc_stringY = cur_stringY[:-remove_inds]
+
+                        word_lookup_marg[(trunc_stringX, trunc_stringY[:-1])] += 1
+                        word_lookup_fut[(trunc_stringX, trunc_stringY)] += 1
+        elif counting_method == 1:
+            for line_ind in range(len(Xs)):
+                stringX = Xs[line_ind]; stringY = Ys[line_ind]
+
+                Tx = len(stringX)
+
+                Ty = len(stringY)
+
+                assert Tx == Ty, 'The two time series must have the same length.'
+
+                T = Tx
+
+                for t_ind in range(T-L_max):
+                    word_lookup_fut[stringX[t_ind:t_ind+L_max+1], stringY[t_ind:t_ind+L_max+1]] += 1
    else:
        Tx = len(stringX)

        Ty = len(stringY)

        assert Tx == Ty, 'The two time series must have the same length.'

        T = Tx

-        # Counter for events (X_{t-L}^{t-1}, Y_{t-L}^{t-1})
-
-        word_lookup_marg = Counter()
-
-        # Counter for events (X_{t-L}^{t-1}, Y_{t-L}^{t-1}, Y_{t})
-
-        word_lookup_fut = Counter()
-
        if verbose:
            print 'Estimating predictive distributions.'

-        for t_ind in range(T-L_max):
-            cur_stringX = stringX[t_ind:(t_ind + L_max + 1)]
-
-            cur_stringY = stringY[t_ind:(t_ind + L_max + 1)]
-
-            word_lookup_marg[(cur_stringX, cur_stringY[:-1])] += 1
-            word_lookup_fut[(cur_stringX, cur_stringY)] += 1
-
-            # for remove_inds in range(0, L_max+1): # DON'T NEED THIS
-
-            for remove_inds in range(1, L_max+1):
-                trunc_stringX = cur_stringX[:-remove_inds]
-                trunc_stringY = cur_stringY[:-remove_inds]
-
-                word_lookup_marg[(trunc_stringX, trunc_stringY[:-1])] += 1
-                word_lookup_fut[(trunc_stringX, trunc_stringY)] += 1
+        if counting_method == 0:
+            for t_ind in range(T-L_max):
+                cur_stringX = stringX[t_ind:(t_ind + L_max + 1)]
+
+                cur_stringY = stringY[t_ind:(t_ind + L_max + 1)]
+
+                word_lookup_marg[(cur_stringX, cur_stringY[:-1])] += 1
+                word_lookup_fut[(cur_stringX, cur_stringY)] += 1
+
+                # for remove_inds in range(0, L_max+1): DON'T NEED THIS
+
+                for remove_inds in range(1, L_max+1):
+                    trunc_stringX = cur_stringX[:-remove_inds]
+                    trunc_stringY = cur_stringY[:-remove_inds]
+
+                    word_lookup_marg[(trunc_stringX, trunc_stringY[:-1])] += 1
+                    word_lookup_fut[(trunc_stringX, trunc_stringY)] += 1
+        elif counting_method == 1:
+            for t_ind in range(T-L_max):
+                word_lookup_fut[stringX[t_ind:t_ind+L_max+1], stringY[t_ind:t_ind+L_max+1]] += 1
+
+    if counting_method == 1:
+        assert axs is not None and ays is not None, "Please provide the alphabets for the input (axs) and output (ays)."
+
+        seen_subword = {}
+
+        histories_by_L = [word_lookup_fut.keys()]
+
+        for L_cur in range(L_max, -1, -1):
+            histories_by_L.append([])
+
+            for wordX, wordY in histories_by_L[-2]:
+                subwordX = wordX[:L_cur]
+                subwordY = wordY[:L_cur]
+
+                if seen_subword.get((subwordX, subwordY), False):
+                    pass
+                else:
+                    histories_by_L[-1].append((subwordX, subwordY))
+
+                    seen_subword[subwordX, subwordY] = True
+
+                    c_xy = 0
+
+                    for ax in axs:
+                        c_x = 0
+
+                        for ay in ays:
+                            c_xy += word_lookup_fut.get((subwordX + ax, subwordY + ay), 0)
+                            c_x += word_lookup_fut.get((subwordX + ax, subwordY + ay), 0)
+
+                        if c_x == 0:
+                            pass
+                        else:
+                            word_lookup_marg[subwordX + ax, subwordY] = c_x
+
+                    if c_xy == 0:
+                        pass
+                    else:
+                        word_lookup_fut[subwordX, subwordY] = c_xy
+
+        del word_lookup_fut['', '']

    return word_lookup_marg, word_lookup_fut
def run_transCSSR(word_lookup_marg, word_lookup_fut, L_max, axs, ays, e_symbols, Xt_name, Yt_name, alpha = 0.001, test_type = 'chi2', fname = None, verbose = False, all_digits = False):
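
The speed-up from counting_method == 1 rests on a marginalization identity: a joint word's count equals the sum of the counts of its one-symbol extensions, count(wx, wy) = sum over ax in axs, ay in ays of count(wx + ax, wy + ay). The edge effects match under both methods because every counted sub-word is anchored at a window start, so counting only the full-length words removes the inner O(L_max) truncation loop from the parsing pass. The sketch below is a condensed, set-based rendering of the same downward recursion that the commit implements with seen_subword and histories_by_L; the strings and alphabets are toy values, and the code runs under both Python 2 and 3.

from collections import Counter

stringX = '01101001110101'
stringY = '10110100011011'
axs = ['0', '1']
ays = ['0', '1']
L_max = 3

# Single parsing pass: count only the joint words of full length L_max + 1.
word_lookup_fut = Counter()

for t_ind in range(len(stringX) - L_max):
    word_lookup_fut[stringX[t_ind:t_ind + L_max + 1], stringY[t_ind:t_ind + L_max + 1]] += 1

word_lookup_marg = Counter()

# Marginalize downward, one word length at a time: counts at length L_cur
# are sums of the (already final) counts at length L_cur + 1.
level = set(word_lookup_fut)

for L_cur in range(L_max, -1, -1):
    next_level = set((wordX[:L_cur], wordY[:L_cur]) for wordX, wordY in level)

    for subwordX, subwordY in next_level:
        c_xy = 0

        for ax in axs:
            c_x = 0

            for ay in ays:
                c = word_lookup_fut.get((subwordX + ax, subwordY + ay), 0)
                c_xy += c
                c_x += c

            if c_x > 0:
                word_lookup_marg[subwordX + ax, subwordY] = c_x

        if c_xy > 0:
            word_lookup_fut[subwordX, subwordY] = c_xy

    level = next_level

# The empty joint word left over from the last level is bookkeeping, not a history.
del word_lookup_fut['', '']
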
