Implement hash_n_degree_quads, fix references to Predicate that should have been Object

In RDF quads the predicate is always an IRI; only the subject, object, and graph name positions can hold blank nodes, so the blank-node bookkeeping needs to inspect the object rather than the predicate.

asonix 2022-12-10 21:39:50 -06:00
parent b4bd3d6781
commit 8432bb542d
2 changed files with 377 additions and 38 deletions

View file

@@ -9,6 +9,7 @@ edition = "2021"
contextual = "0.1.3"
indexmap = "1.9.2"
iref = "2.2.0"
itertools = "0.10.5"
json-ld = "0.9.1"
locspan = "0.7.9"
rdf-types = "0.12.4"

View file

@@ -1,23 +1,25 @@
use contextual::WithContext;
use indexmap::IndexMap;
use iref::IriBuf;
use json_ld::{ExpandedDocument, ValidId as Subject};
use itertools::Itertools;
use json_ld::{rdf::Value, ExpandedDocument, ValidId as Subject};
use locspan::{Location, Meta, Span};
use rdf_types::{
generator::{Blank, WithMetadata},
BlankIdVocabulary, BlankIdVocabularyMut, IriVocabulary, Vocabulary, VocabularyMut,
};
use std::{
borrow::Borrow,
borrow::{Borrow, Cow},
collections::{BTreeMap, HashMap, HashSet},
hash::Hash,
};
mod input_dataset;
use input_dataset::{NormalizingQuad, Position, QuadSubject};
use input_dataset::{InputDataset, NormalizingQuad, Position, QuadSubject, QuadValue};
pub use input_dataset::InputDataset;
#[derive(Clone, Debug)]
pub struct Security;
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct HexHash(pub String);
@@ -63,12 +65,31 @@ where
hash_to_blank_nodes: BTreeMap<HexHash, HashSet<N::BlankId>>,
}
pub fn normalize<N, S>(
vocabulary: N,
document_id: IriBuf,
expanded: Expanded<N>,
bail_on_large_inputs: bool,
) -> Result<OutputDataset<N>, Security>
where
S: Sha256 + Default,
N: Vocabulary + VocabularyMut + Default,
N::Iri: Clone + Eq + Hash + Send + Sync,
N::BlankId: Clone + Eq + Hash + Send + Sync + for<'a> Borrow<&'a N::BlankId>,
{
CanonicalizationState::<N, S>::new(vocabulary).normalize(
document_id,
expanded,
bail_on_large_inputs,
)
}
impl<N, S> CanonicalizationState<N, S>
where
N: Vocabulary,
{
/// Step 1
pub fn new(vocabulary: N) -> Self
fn new(vocabulary: N) -> Self
where
S: Default,
{
@@ -82,7 +103,12 @@ where
}
}
pub fn normalize(&mut self, document_id: IriBuf, expanded: Expanded<N>) -> OutputDataset<N>
fn normalize(
mut self,
document_id: IriBuf,
expanded: Expanded<N>,
bail_on_large_inputs: bool,
) -> Result<OutputDataset<N>, Security>
where
S: Sha256,
N: VocabularyMut + Default,
@@ -98,10 +124,10 @@ where
self.issue_simple_canonical_identifiers(&input_dataset);
// Step 6
self.issue_complex_canonical_identifiers();
self.issue_complex_canonical_identifiers(bail_on_large_inputs, &input_dataset)?;
// Step 7
self.normalize_quads(&input_dataset)
Ok(self.normalize_quads(&input_dataset))
}
// (preparing input dataset is not a step, but we're coming from json ld types here)
@@ -123,13 +149,19 @@ where
{
for (position, quad) in input_dataset.quads() {
// step 2.1
for field in [Some(quad.subject()), Some(quad.predicate()), quad.graph()] {
if let Some(Subject::Blank(ref blank_id)) = field {
self.blank_node_to_quads
.entry(blank_id.clone())
.or_default()
.insert(position);
}
let iter = [
subject_as_blank_id::<N>(quad.subject()),
object_as_blank_id::<N>(quad.object()),
quad.graph().and_then(subject_as_blank_id::<N>),
]
.into_iter()
.filter_map(|opt| opt);
for blank_id in iter {
self.blank_node_to_quads
.entry(blank_id.clone())
.or_default()
.insert(position);
}
}
}
@@ -206,10 +238,15 @@ where
}
// Step 6
fn issue_complex_canonical_identifiers(&mut self)
fn issue_complex_canonical_identifiers(
&mut self,
bail_on_large_inputs: bool,
input_dataset: &InputDataset<N>,
) -> Result<(), Security>
where
N: Default + BlankIdVocabularyMut + VocabularyMut,
N::BlankId: Clone + Eq + Hash,
N::BlankId: Clone + Eq + Hash + for<'a> Borrow<&'a N::BlankId>,
S: Sha256,
{
let hash_to_blank_nodes =
std::mem::replace(&mut self.hash_to_blank_nodes, Default::default());
@@ -225,6 +262,10 @@ where
continue;
}
if bail_on_large_inputs {
return Err(Security);
}
// step 6.2.2
let mut temporary_issuer = make_issuer("_:b");
@@ -240,11 +281,16 @@ where
);
// step 6.2.4
let hash = self.hash_n_degree_quads(
identifier,
&mut temporary_issuer,
let hash = hash_n_degree_quads(
&self.blank_node_to_quads,
&mut temporary_vocabulary,
&self.vocabulary,
&mut temporary_issuer,
&mut issued_identifier_list,
&self.issued_identifier_list,
identifier,
input_dataset,
&mut self.sha256,
);
hash_path_list.push((hash, issued_identifier_list));
@@ -263,6 +309,8 @@ where
}
}
}
Ok(())
}
// Step 7
@@ -277,9 +325,9 @@ where
// step 7.1
let subject = self.translate_subject(quad.subject())?;
let predicate = self.translate_subject(quad.predicate())?;
let predicate = quad.predicate().clone();
let object = quad.object().clone();
let object = self.translate_object(quad.object())?;
let graph = if let Some(graph) = quad.graph() {
Some(self.translate_subject(graph)?)
@@ -294,6 +342,17 @@ where
OutputDataset { quads }
}
fn translate_object(&self, object: &QuadValue<N>) -> Option<QuadValue<N>>
where
N::BlankId: Eq + Hash + Clone + for<'a> Borrow<&'a N::BlankId>,
N::Iri: Clone,
{
match object {
Value::Reference(subject) => Some(Value::Reference(self.translate_subject(subject)?)),
Value::Literal(literal) => Some(Value::Literal(literal.clone())),
}
}
fn translate_subject(&self, subject: &QuadSubject<N>) -> Option<QuadSubject<N>>
where
N::BlankId: Eq + Hash + Clone + for<'a> Borrow<&'a N::BlankId>,
@@ -306,16 +365,248 @@ where
)),
}
}
}
fn hash_n_degree_quads(
&self,
identifier: N::BlankId,
issuer: &mut Blank,
vocabulary: &mut N,
issued_identifier_list: &mut IndexMap<N::BlankId, N::BlankId>,
) -> HexHash {
todo!()
fn hash_n_degree_quads<N, S>(
blank_node_to_quads: &HashMap<N::BlankId, HashSet<Position>>,
vocabulary: &mut N,
canon_vocabulary: &N,
issuer: &mut Blank,
issued_identifier_list: &mut IndexMap<N::BlankId, N::BlankId>,
canon_issued_identifier_list: &IndexMap<N::BlankId, N::BlankId>,
identifier: N::BlankId,
input_dataset: &InputDataset<N>,
sha256: &mut S,
) -> HexHash
where
N: Vocabulary + VocabularyMut,
N::BlankId: Clone + Eq + Hash + for<'a> Borrow<&'a N::BlankId>,
S: Sha256,
{
// step 1
let mut hash_to_related_blank_nodes: HashMap<HexHash, HashSet<N::BlankId>> = HashMap::new();
// step 2
if let Some(quad_positions) = blank_node_to_quads.get(&&identifier) {
// step 3
for quad_position in quad_positions {
let quad = input_dataset
.get(*quad_position)
.expect("Positions are created from the input dataset");
// step 3.1
let iter = [
("s", subject_as_blank_id::<N>(quad.subject())),
("o", object_as_blank_id::<N>(quad.object())),
("g", quad.graph().and_then(subject_as_blank_id::<N>)),
]
.into_iter()
.filter_map(|(position, opt)| Some((position, opt?)))
.filter(|(_, blank_id)| identifier != **blank_id);
for (position, related) in iter {
// step 3.1.1
let hash = hash_related_blank_node(
blank_node_to_quads,
canon_issued_identifier_list,
canon_vocabulary,
issued_identifier_list,
vocabulary,
related,
quad,
position,
input_dataset,
sha256,
);
// step 3.1.2
hash_to_related_blank_nodes
.entry(hash)
.or_default()
.insert(related.clone());
}
}
}
// step 4
let mut data_to_hash = String::new();
// step 5
for (related_hash, blank_node_list) in hash_to_related_blank_nodes {
// step 5.1
data_to_hash += &related_hash.0;
// step 5.2
let mut chosen_path = String::new();
// step 5.3
let mut chosen_issuer = Default::default();
let mut chosen_issued_identifier_list = Default::default();
'permute: for permutation in permute(blank_node_list) {
// step 5.4.1
let mut issuer_copy = Blank::new_full(issuer.prefix().to_string(), issuer.count());
let mut issued_identifier_list_copy = issued_identifier_list.clone();
// step 5.4.2
let mut path = String::new();
// step 5.4.3
let mut recursion_list = HashSet::new();
// step 5.4.4
for related in permutation {
if let Some(blank) = canon_issued_identifier_list.get(&related) {
// step 5.4.4.1
if let Some(blank_id) = canon_vocabulary.blank_id(blank) {
path += &blank_id.to_string();
} else {
eprintln!("No blank in vocabulary");
}
} else {
// step 5.4.4.2
// step 5.4.4.2.1
recursion_list.insert(related.clone());
// step 5.4.4.2.2
issue_identifier_algorithm(
related,
&mut issuer_copy,
vocabulary,
&mut issued_identifier_list_copy,
);
}
// step 5.4.4.3
if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path
{
continue 'permute;
}
}
// step 5.4.5
for related in recursion_list {
// step 5.4.5.1
let result = hash_n_degree_quads(
blank_node_to_quads,
vocabulary,
canon_vocabulary,
&mut issuer_copy,
&mut issued_identifier_list_copy,
canon_issued_identifier_list,
related.clone(),
input_dataset,
sha256,
);
// step 5.4.5.2
let new_blank = issue_identifier_algorithm(
related,
&mut issuer_copy,
vocabulary,
&mut issued_identifier_list_copy,
);
if let Some(blank_id) = vocabulary.blank_id(&new_blank) {
path += &blank_id.to_string();
// step 5.4.5.3
path += "<";
path += result.0.as_str();
path += ">";
} else {
eprintln!("No blank in vocabulary");
}
// step 5.4.5.4 is a no-op
// step 5.4.5.5
if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path
{
continue 'permute;
}
}
if chosen_path.is_empty() || path < chosen_path {
chosen_path = path;
chosen_issuer = issuer_copy;
chosen_issued_identifier_list = issued_identifier_list_copy;
}
}
// step 5.5
data_to_hash += &chosen_path;
// step 5.6
std::mem::swap(issuer, &mut chosen_issuer);
std::mem::swap(issued_identifier_list, &mut chosen_issued_identifier_list);
}
// step 6
sha256.update(data_to_hash.as_bytes());
sha256.finalize_hex_and_reset()
}
fn permute<B>(set: HashSet<B>) -> impl Iterator<Item = Vec<B>>
where
B: Hash + Eq + Clone,
{
let len = set.len();
set.into_iter().permutations(len)
}
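
The permute helper above leans on the permutations adaptor from the itertools crate added to Cargo.toml in this commit, enumerating every ordering of the related blank nodes for step 5.4. A standalone sketch of that behaviour, not part of the diff, with hypothetical blank node labels standing in for N::BlankId:

use itertools::Itertools;
use std::collections::HashSet;

fn main() {
    // Hypothetical blank node labels standing in for N::BlankId values.
    let related: HashSet<&str> = ["_:b0", "_:b1", "_:b2"].into_iter().collect();

    // Same shape as permute(): every ordering of the full set.
    let len = related.len();
    for permutation in related.into_iter().permutations(len) {
        // 3! = 6 orderings; hash_n_degree_quads builds a path for each
        // and keeps the lexicographically least one.
        println!("{permutation:?}");
    }
}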
fn hash_related_blank_node<N, S>(
blank_node_to_quads: &HashMap<N::BlankId, HashSet<Position>>,
canon_issued_identifier_list: &IndexMap<N::BlankId, N::BlankId>,
canon_vocabulary: &N,
issued_identifier_list: &IndexMap<N::BlankId, N::BlankId>,
vocabulary: &N,
related: &N::BlankId,
quad: &NormalizingQuad<N>,
position: &str,
input_dataset: &InputDataset<N>,
sha256: &mut S,
) -> HexHash
where
N: Vocabulary,
N::BlankId: Clone + Eq + Hash + for<'a> Borrow<&'a N::BlankId>,
S: Sha256,
{
// step 1
let identifier = if let Some(blank_id) = canon_issued_identifier_list.get(related) {
let blank = canon_vocabulary
.blank_id(blank_id)
.expect("No blank in vocabulary");
blank.to_string()
} else if let Some(blank_id) = issued_identifier_list.get(related) {
let blank = vocabulary
.blank_id(blank_id)
.expect("No blank in vocabulary");
blank.to_string()
} else {
hash_first_degree_quads(
blank_node_to_quads,
canon_vocabulary,
related.clone(),
input_dataset,
sha256,
)
.0
};
// step 2
let mut input = String::from(position);
// step 3
if position != "g" {
input += "<";
input += &quad.predicate().with(canon_vocabulary).to_string();
input += ">";
}
// step 4
input += &identifier;
// step 5
sha256.update(input.as_bytes());
sha256.finalize_hex_and_reset()
}
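
For each related blank node, hash_related_blank_node hashes a small string: the position tag ("s", "o", or "g"), the predicate wrapped in angle brackets unless the position is the graph name, and then the related node's identifier. A standalone sketch of that construction, separate from the diff, using the sha2 crate purely for illustration (the project keeps hashing behind its own Sha256 trait) and hypothetical inputs:

use sha2::{Digest, Sha256};

// Steps 2-5 in miniature: position tag, optional "<predicate>", identifier,
// then the SHA-256 digest rendered as lowercase hex.
fn related_hash(position: &str, predicate_iri: &str, identifier: &str) -> String {
    let mut input = String::from(position);
    if position != "g" {
        input.push('<');
        input.push_str(predicate_iri);
        input.push('>');
    }
    input.push_str(identifier);

    let digest = Sha256::digest(input.as_bytes());
    digest.iter().map(|byte| format!("{byte:02x}")).collect()
}

fn main() {
    println!("{}", related_hash("o", "http://example.com/p", "_:b3"));
}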
fn hash_first_degree_quads<N, S>(
@@ -334,9 +625,9 @@ where
let mut nquads = Vec::new();
// step 2
if let Some(quads) = blank_node_to_quads.get(&identifier) {
if let Some(quad_positions) = blank_node_to_quads.get(&identifier) {
// step 3
for quad_position in quads {
for quad_position in quad_positions {
let quad = input_dataset
.get(*quad_position)
.expect("Positions are created from the input dataset");
@@ -365,8 +656,8 @@ where
N::BlankId: Clone + Eq,
{
let subject = serialize_subject(identifier, quad.subject(), vocabulary);
let predicate = serialize_subject(identifier, quad.predicate(), vocabulary);
let object = quad.object().with(vocabulary);
let predicate = quad.predicate().with(vocabulary);
let object = serialize_object(identifier, quad.object(), vocabulary);
let graph = quad
.graph()
.map(|graph| serialize_subject(identifier, graph, vocabulary));
@@ -378,17 +669,36 @@ where
}
}
fn serialize_subject<N>(identifier: &N::BlankId, subject: &QuadSubject<N>, vocabulary: &N) -> String
fn serialize_subject<N>(
identifier: &N::BlankId,
subject: &QuadSubject<N>,
vocabulary: &N,
) -> Cow<'static, str>
where
N: Vocabulary,
N::BlankId: Eq,
{
if subject.is_blank() && matches_identifier::<N>(identifier, subject) {
String::from("_:a")
Cow::Borrowed("_:a")
} else if subject.is_blank() {
String::from("_:z")
Cow::Borrowed("_:z")
} else {
format!("{}", subject.with(vocabulary))
Cow::Owned(subject.with(vocabulary).to_string())
}
}
fn serialize_object<N>(
identifier: &N::BlankId,
object: &QuadValue<N>,
vocabulary: &N,
) -> Cow<'static, str>
where
N: Vocabulary,
N::BlankId: Eq,
{
match object {
Value::Literal(lit) => Cow::Owned(lit.with(vocabulary).to_string()),
Value::Reference(subject) => serialize_subject(identifier, subject, vocabulary),
}
}
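
serialize_subject and serialize_object implement the substitution rule used when hashing first-degree quads: the blank node currently being hashed serializes as _:a, every other blank node as _:z, and anything else is rendered through the vocabulary. A simplified standalone sketch over plain strings (hypothetical labels, sidestepping the Vocabulary and QuadSubject types), not part of the diff:

// Simplified stand-in for serialize_subject: the reference blank node
// becomes "_:a", any other blank node becomes "_:z", and non-blank terms
// pass through (the real code renders them via the vocabulary's Display).
fn serialize_term(identifier: &str, term: &str) -> String {
    if term.starts_with("_:") {
        if term == identifier {
            "_:a".to_string()
        } else {
            "_:z".to_string()
        }
    } else {
        term.to_string()
    }
}

fn main() {
    let identifier = "_:b0";
    assert_eq!(serialize_term(identifier, "_:b0"), "_:a");
    assert_eq!(serialize_term(identifier, "_:b7"), "_:z");
    assert_eq!(serialize_term(identifier, "http://example.com/s"), "http://example.com/s");
}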
@@ -403,6 +713,26 @@ where
}
}
fn subject_as_blank_id<N>(subject: &QuadSubject<N>) -> Option<&N::BlankId>
where
N: Vocabulary,
{
match subject {
Subject::Blank(ref blank) => Some(blank),
_ => None,
}
}
fn object_as_blank_id<N>(object: &QuadValue<N>) -> Option<&N::BlankId>
where
N: Vocabulary,
{
match object {
Value::Reference(Subject::Blank(ref blank)) => Some(blank),
_ => None,
}
}
fn issue_identifier_algorithm<N>(
identifier: N::BlankId,
generator: &mut Blank,
@@ -440,3 +770,11 @@ fn canonicalization_node_generator() -> Blank {
fn make_issuer(prefix: &str) -> Blank {
Blank::new_with_prefix(String::from(prefix))
}
impl std::fmt::Display for Security {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Aborted due to time complexity")
}
}
impl std::error::Error for Security {}