Implement hash_n_degree_quads, fix references to Predicate that should have been Object
This commit is contained in:
parent
b4bd3d6781
commit
8432bb542d
|
@ -9,6 +9,7 @@ edition = "2021"
|
|||
contextual = "0.1.3"
|
||||
indexmap = "1.9.2"
|
||||
iref = "2.2.0"
|
||||
itertools = "0.10.5"
|
||||
json-ld = "0.9.1"
|
||||
locspan = "0.7.9"
|
||||
rdf-types = "0.12.4"
|
||||
|
|
414
src/lib.rs
414
src/lib.rs
|
@ -1,23 +1,25 @@
|
|||
use contextual::WithContext;
|
||||
use indexmap::IndexMap;
|
||||
use iref::IriBuf;
|
||||
use json_ld::{ExpandedDocument, ValidId as Subject};
|
||||
use itertools::Itertools;
|
||||
use json_ld::{rdf::Value, ExpandedDocument, ValidId as Subject};
|
||||
use locspan::{Location, Meta, Span};
|
||||
use rdf_types::{
|
||||
generator::{Blank, WithMetadata},
|
||||
BlankIdVocabulary, BlankIdVocabularyMut, IriVocabulary, Vocabulary, VocabularyMut,
|
||||
};
|
||||
use std::{
|
||||
borrow::Borrow,
|
||||
borrow::{Borrow, Cow},
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
hash::Hash,
|
||||
};
|
||||
|
||||
mod input_dataset;
|
||||
|
||||
use input_dataset::{NormalizingQuad, Position, QuadSubject};
|
||||
use input_dataset::{InputDataset, NormalizingQuad, Position, QuadSubject, QuadValue};
|
||||
|
||||
pub use input_dataset::InputDataset;
|
||||
#[derive(Clone, Debug)]
|
||||
/// Error value returned by the normalization entry points when they give up
/// instead of running the expensive n-degree hashing step; its `Display`
/// text is "Aborted due to time complexity".
pub struct Security;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
/// Newtype around the hex string produced by `finalize_hex_and_reset`.
/// It derives `Ord`, which lets it be used as a `BTreeMap` key.
pub struct HexHash(pub String);
|
||||
|
@ -63,12 +65,31 @@ where
|
|||
hash_to_blank_nodes: BTreeMap<HexHash, HashSet<N::BlankId>>,
|
||||
}
|
||||
|
||||
pub fn normalize<N, S>(
|
||||
vocabulary: N,
|
||||
document_id: IriBuf,
|
||||
expanded: Expanded<N>,
|
||||
bail_on_large_inputs: bool,
|
||||
) -> Result<OutputDataset<N>, Security>
|
||||
where
|
||||
S: Sha256 + Default,
|
||||
N: Vocabulary + VocabularyMut + Default,
|
||||
N::Iri: Clone + Eq + Hash + Send + Sync,
|
||||
N::BlankId: Clone + Eq + Hash + Send + Sync + for<'a> Borrow<&'a N::BlankId>,
|
||||
{
|
||||
CanonicalizationState::<N, S>::new(vocabulary).normalize(
|
||||
document_id,
|
||||
expanded,
|
||||
bail_on_large_inputs,
|
||||
)
|
||||
}
|
||||
|
||||
impl<N, S> CanonicalizationState<N, S>
|
||||
where
|
||||
N: Vocabulary,
|
||||
{
|
||||
/// Step 1
|
||||
pub fn new(vocabulary: N) -> Self
|
||||
fn new(vocabulary: N) -> Self
|
||||
where
|
||||
S: Default,
|
||||
{
|
||||
|
@ -82,7 +103,12 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
pub fn normalize(&mut self, document_id: IriBuf, expanded: Expanded<N>) -> OutputDataset<N>
|
||||
fn normalize(
|
||||
mut self,
|
||||
document_id: IriBuf,
|
||||
expanded: Expanded<N>,
|
||||
bail_on_large_inputs: bool,
|
||||
) -> Result<OutputDataset<N>, Security>
|
||||
where
|
||||
S: Sha256,
|
||||
N: VocabularyMut + Default,
|
||||
|
@ -98,10 +124,10 @@ where
|
|||
self.issue_simple_canonical_identifiers(&input_dataset);
|
||||
|
||||
// Step 6
|
||||
self.issue_complex_canonical_identifiers();
|
||||
self.issue_complex_canonical_identifiers(bail_on_large_inputs, &input_dataset)?;
|
||||
|
||||
// Step 7
|
||||
self.normalize_quads(&input_dataset)
|
||||
Ok(self.normalize_quads(&input_dataset))
|
||||
}
|
||||
|
||||
// (preparing input dataset is not a step, but we're coming from json ld types here)
|
||||
|
@ -123,13 +149,19 @@ where
|
|||
{
|
||||
for (position, quad) in input_dataset.quads() {
|
||||
// step 2.1
|
||||
for field in [Some(quad.subject()), Some(quad.predicate()), quad.graph()] {
|
||||
if let Some(Subject::Blank(ref blank_id)) = field {
|
||||
self.blank_node_to_quads
|
||||
.entry(blank_id.clone())
|
||||
.or_default()
|
||||
.insert(position);
|
||||
}
|
||||
let iter = [
|
||||
subject_as_blank_id::<N>(quad.subject()),
|
||||
object_as_blank_id::<N>(quad.object()),
|
||||
quad.graph().and_then(subject_as_blank_id::<N>),
|
||||
]
|
||||
.into_iter()
|
||||
.filter_map(|opt| opt);
|
||||
|
||||
for blank_id in iter {
|
||||
self.blank_node_to_quads
|
||||
.entry(blank_id.clone())
|
||||
.or_default()
|
||||
.insert(position);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -206,10 +238,15 @@ where
|
|||
}
|
||||
|
||||
// Step 6
|
||||
fn issue_complex_canonical_identifiers(&mut self)
|
||||
fn issue_complex_canonical_identifiers(
|
||||
&mut self,
|
||||
bail_on_large_inputs: bool,
|
||||
input_dataset: &InputDataset<N>,
|
||||
) -> Result<(), Security>
|
||||
where
|
||||
N: Default + BlankIdVocabularyMut + VocabularyMut,
|
||||
N::BlankId: Clone + Eq + Hash,
|
||||
N::BlankId: Clone + Eq + Hash + for<'a> Borrow<&'a N::BlankId>,
|
||||
S: Sha256,
|
||||
{
|
||||
let hash_to_blank_nodes =
|
||||
std::mem::replace(&mut self.hash_to_blank_nodes, Default::default());
|
||||
|
@ -225,6 +262,10 @@ where
|
|||
continue;
|
||||
}
|
||||
|
||||
if bail_on_large_inputs {
|
||||
return Err(Security);
|
||||
}
|
||||
|
||||
// step 6.2.2
|
||||
let mut temporary_issuer = make_issuer("_:b");
|
||||
|
||||
|
@ -240,11 +281,16 @@ where
|
|||
);
|
||||
|
||||
// step 6.2.4
|
||||
let hash = self.hash_n_degree_quads(
|
||||
identifier,
|
||||
&mut temporary_issuer,
|
||||
let hash = hash_n_degree_quads(
|
||||
&self.blank_node_to_quads,
|
||||
&mut temporary_vocabulary,
|
||||
&self.vocabulary,
|
||||
&mut temporary_issuer,
|
||||
&mut issued_identifier_list,
|
||||
&self.issued_identifier_list,
|
||||
identifier,
|
||||
input_dataset,
|
||||
&mut self.sha256,
|
||||
);
|
||||
|
||||
hash_path_list.push((hash, issued_identifier_list));
|
||||
|
@ -263,6 +309,8 @@ where
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Step 7
|
||||
|
@ -277,9 +325,9 @@ where
|
|||
// step 7.1
|
||||
let subject = self.translate_subject(quad.subject())?;
|
||||
|
||||
let predicate = self.translate_subject(quad.predicate())?;
|
||||
let predicate = quad.predicate().clone();
|
||||
|
||||
let object = quad.object().clone();
|
||||
let object = self.translate_object(quad.object())?;
|
||||
|
||||
let graph = if let Some(graph) = quad.graph() {
|
||||
Some(self.translate_subject(graph)?)
|
||||
|
@ -294,6 +342,17 @@ where
|
|||
OutputDataset { quads }
|
||||
}
|
||||
|
||||
fn translate_object(&self, object: &QuadValue<N>) -> Option<QuadValue<N>>
|
||||
where
|
||||
N::BlankId: Eq + Hash + Clone + for<'a> Borrow<&'a N::BlankId>,
|
||||
N::Iri: Clone,
|
||||
{
|
||||
match object {
|
||||
Value::Reference(subject) => Some(Value::Reference(self.translate_subject(subject)?)),
|
||||
Value::Literal(literal) => Some(Value::Literal(literal.clone())),
|
||||
}
|
||||
}
|
||||
|
||||
fn translate_subject(&self, subject: &QuadSubject<N>) -> Option<QuadSubject<N>>
|
||||
where
|
||||
N::BlankId: Eq + Hash + Clone + for<'a> Borrow<&'a N::BlankId>,
|
||||
|
@ -306,16 +365,248 @@ where
|
|||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn hash_n_degree_quads(
|
||||
&self,
|
||||
identifier: N::BlankId,
|
||||
issuer: &mut Blank,
|
||||
vocabulary: &mut N,
|
||||
issued_identifier_list: &mut IndexMap<N::BlankId, N::BlankId>,
|
||||
) -> HexHash {
|
||||
todo!()
|
||||
fn hash_n_degree_quads<N, S>(
|
||||
blank_node_to_quads: &HashMap<N::BlankId, HashSet<Position>>,
|
||||
vocabulary: &mut N,
|
||||
canon_vocabulary: &N,
|
||||
issuer: &mut Blank,
|
||||
issued_identifier_list: &mut IndexMap<N::BlankId, N::BlankId>,
|
||||
canon_issued_identifier_list: &IndexMap<N::BlankId, N::BlankId>,
|
||||
identifier: N::BlankId,
|
||||
input_dataset: &InputDataset<N>,
|
||||
sha256: &mut S,
|
||||
) -> HexHash
|
||||
where
|
||||
N: Vocabulary + VocabularyMut,
|
||||
N::BlankId: Clone + Eq + Hash + for<'a> Borrow<&'a N::BlankId>,
|
||||
S: Sha256,
|
||||
{
|
||||
// step 1
|
||||
let mut hash_to_related_blank_nodes: HashMap<HexHash, HashSet<N::BlankId>> = HashMap::new();
|
||||
|
||||
// step 2
|
||||
if let Some(quad_positions) = blank_node_to_quads.get(&&identifier) {
|
||||
// step 3
|
||||
for quad_position in quad_positions {
|
||||
let quad = input_dataset
|
||||
.get(*quad_position)
|
||||
.expect("Positions are created from the input dataset");
|
||||
|
||||
// step 3.1
|
||||
let iter = [
|
||||
("s", subject_as_blank_id::<N>(quad.subject())),
|
||||
("o", object_as_blank_id::<N>(quad.object())),
|
||||
("g", quad.graph().and_then(subject_as_blank_id::<N>)),
|
||||
]
|
||||
.into_iter()
|
||||
.filter_map(|(position, opt)| Some((position, opt?)))
|
||||
.filter(|(_, blank_id)| identifier != **blank_id);
|
||||
|
||||
for (position, related) in iter {
|
||||
// step 3.1.1
|
||||
let hash = hash_related_blank_node(
|
||||
blank_node_to_quads,
|
||||
canon_issued_identifier_list,
|
||||
canon_vocabulary,
|
||||
issued_identifier_list,
|
||||
vocabulary,
|
||||
related,
|
||||
quad,
|
||||
position,
|
||||
input_dataset,
|
||||
sha256,
|
||||
);
|
||||
|
||||
// step 3.1.2
|
||||
hash_to_related_blank_nodes
|
||||
.entry(hash)
|
||||
.or_default()
|
||||
.insert(related.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// step 4
|
||||
let mut data_to_hash = String::new();
|
||||
|
||||
// step 5
|
||||
for (related_hash, blank_node_list) in hash_to_related_blank_nodes {
|
||||
// step 5.1
|
||||
data_to_hash += &related_hash.0;
|
||||
|
||||
// step 5.2
|
||||
let mut chosen_path = String::new();
|
||||
|
||||
// step 5.3
|
||||
let mut chosen_issuer = Default::default();
|
||||
let mut chosen_issued_identifier_list = Default::default();
|
||||
|
||||
'permute: for permutation in permute(blank_node_list) {
|
||||
// step 5.4.1
|
||||
let mut issuer_copy = Blank::new_full(issuer.prefix().to_string(), issuer.count());
|
||||
let mut issued_identifier_list_copy = issued_identifier_list.clone();
|
||||
// step 5.4.2
|
||||
let mut path = String::new();
|
||||
// step 5.4.3
|
||||
let mut recursion_list = HashSet::new();
|
||||
|
||||
// step 5.4.4
|
||||
for related in permutation {
|
||||
if let Some(blank) = canon_issued_identifier_list.get(&related) {
|
||||
// step 5.4.4.1
|
||||
if let Some(blank_id) = canon_vocabulary.blank_id(blank) {
|
||||
path += &blank_id.to_string();
|
||||
} else {
|
||||
eprintln!("No blank in vocabulary");
|
||||
}
|
||||
} else {
|
||||
// step 5.4.4.2
|
||||
// step 5.4.4.2.1
|
||||
recursion_list.insert(related.clone());
|
||||
// step 5.4.4.2.2
|
||||
issue_identifier_algorithm(
|
||||
related,
|
||||
&mut issuer_copy,
|
||||
vocabulary,
|
||||
&mut issued_identifier_list_copy,
|
||||
);
|
||||
}
|
||||
|
||||
// step 5.4.4.3
|
||||
if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path
|
||||
{
|
||||
continue 'permute;
|
||||
}
|
||||
}
|
||||
|
||||
// step 5.4.5
|
||||
for related in recursion_list {
|
||||
// step 5.4.5.1
|
||||
let result = hash_n_degree_quads(
|
||||
blank_node_to_quads,
|
||||
vocabulary,
|
||||
canon_vocabulary,
|
||||
&mut issuer_copy,
|
||||
&mut issued_identifier_list_copy,
|
||||
canon_issued_identifier_list,
|
||||
related.clone(),
|
||||
input_dataset,
|
||||
sha256,
|
||||
);
|
||||
// step 5.4.5.2
|
||||
let new_blank = issue_identifier_algorithm(
|
||||
related,
|
||||
&mut issuer_copy,
|
||||
vocabulary,
|
||||
&mut issued_identifier_list_copy,
|
||||
);
|
||||
|
||||
if let Some(blank_id) = vocabulary.blank_id(&new_blank) {
|
||||
path += &blank_id.to_string();
|
||||
|
||||
// step 5.4.5.3
|
||||
path += "<";
|
||||
path += result.0.as_str();
|
||||
path += ">";
|
||||
} else {
|
||||
eprintln!("No blank in vocabulary");
|
||||
}
|
||||
|
||||
// step 5.4.5.4 is a no-op
|
||||
|
||||
// step 5.4.5.5
|
||||
if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path
|
||||
{
|
||||
continue 'permute;
|
||||
}
|
||||
}
|
||||
|
||||
if chosen_path.is_empty() || path < chosen_path {
|
||||
chosen_path = path;
|
||||
chosen_issuer = issuer_copy;
|
||||
chosen_issued_identifier_list = issued_identifier_list_copy;
|
||||
}
|
||||
}
|
||||
|
||||
// step 5.5
|
||||
data_to_hash += &chosen_path;
|
||||
|
||||
// step 5.6
|
||||
std::mem::swap(issuer, &mut chosen_issuer);
|
||||
std::mem::swap(issued_identifier_list, &mut chosen_issued_identifier_list);
|
||||
}
|
||||
|
||||
// step 6
|
||||
sha256.update(data_to_hash.as_bytes());
|
||||
sha256.finalize_hex_and_reset()
|
||||
}
|
||||
|
||||
fn permute<B>(set: HashSet<B>) -> impl Iterator<Item = Vec<B>>
|
||||
where
|
||||
B: Hash + Eq + Clone,
|
||||
{
|
||||
let len = set.len();
|
||||
|
||||
set.into_iter().permutations(len)
|
||||
}
|
||||
|
||||
fn hash_related_blank_node<N, S>(
|
||||
blank_node_to_quads: &HashMap<N::BlankId, HashSet<Position>>,
|
||||
canon_issued_identifier_list: &IndexMap<N::BlankId, N::BlankId>,
|
||||
canon_vocabulary: &N,
|
||||
issued_identifier_list: &IndexMap<N::BlankId, N::BlankId>,
|
||||
vocabulary: &N,
|
||||
related: &N::BlankId,
|
||||
quad: &NormalizingQuad<N>,
|
||||
position: &str,
|
||||
input_dataset: &InputDataset<N>,
|
||||
sha256: &mut S,
|
||||
) -> HexHash
|
||||
where
|
||||
N: Vocabulary,
|
||||
N::BlankId: Clone + Eq + Hash + for<'a> Borrow<&'a N::BlankId>,
|
||||
S: Sha256,
|
||||
{
|
||||
// step 1
|
||||
let identifier = if let Some(blank_id) = canon_issued_identifier_list.get(related) {
|
||||
let blank = canon_vocabulary
|
||||
.blank_id(blank_id)
|
||||
.expect("No blank in vocabulary");
|
||||
blank.to_string()
|
||||
} else if let Some(blank_id) = issued_identifier_list.get(related) {
|
||||
let blank = vocabulary
|
||||
.blank_id(blank_id)
|
||||
.expect("No blank in vocabulary");
|
||||
blank.to_string()
|
||||
} else {
|
||||
hash_first_degree_quads(
|
||||
blank_node_to_quads,
|
||||
canon_vocabulary,
|
||||
related.clone(),
|
||||
input_dataset,
|
||||
sha256,
|
||||
)
|
||||
.0
|
||||
};
|
||||
|
||||
// step 2
|
||||
let mut input = String::from(position);
|
||||
|
||||
// step 3
|
||||
if position != "g" {
|
||||
input += "<";
|
||||
input += &quad.predicate().with(canon_vocabulary).to_string();
|
||||
input += ">";
|
||||
}
|
||||
|
||||
// step 4
|
||||
input += &identifier;
|
||||
|
||||
// step 5
|
||||
sha256.update(input.as_bytes());
|
||||
sha256.finalize_hex_and_reset()
|
||||
}
|
||||
|
||||
fn hash_first_degree_quads<N, S>(
|
||||
|
@ -334,9 +625,9 @@ where
|
|||
let mut nquads = Vec::new();
|
||||
|
||||
// step 2
|
||||
if let Some(quads) = blank_node_to_quads.get(&identifier) {
|
||||
if let Some(quad_positions) = blank_node_to_quads.get(&identifier) {
|
||||
// step 3
|
||||
for quad_position in quads {
|
||||
for quad_position in quad_positions {
|
||||
let quad = input_dataset
|
||||
.get(*quad_position)
|
||||
.expect("Positions are created from the input dataset");
|
||||
|
@ -365,8 +656,8 @@ where
|
|||
N::BlankId: Clone + Eq,
|
||||
{
|
||||
let subject = serialize_subject(identifier, quad.subject(), vocabulary);
|
||||
let predicate = serialize_subject(identifier, quad.predicate(), vocabulary);
|
||||
let object = quad.object().with(vocabulary);
|
||||
let predicate = quad.predicate().with(vocabulary);
|
||||
let object = serialize_object(identifier, quad.object(), vocabulary);
|
||||
let graph = quad
|
||||
.graph()
|
||||
.map(|graph| serialize_subject(identifier, graph, vocabulary));
|
||||
|
@ -378,17 +669,36 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
fn serialize_subject<N>(identifier: &N::BlankId, subject: &QuadSubject<N>, vocabulary: &N) -> String
|
||||
fn serialize_subject<N>(
|
||||
identifier: &N::BlankId,
|
||||
subject: &QuadSubject<N>,
|
||||
vocabulary: &N,
|
||||
) -> Cow<'static, str>
|
||||
where
|
||||
N: Vocabulary,
|
||||
N::BlankId: Eq,
|
||||
{
|
||||
if subject.is_blank() && matches_identifier::<N>(identifier, subject) {
|
||||
String::from("_:a")
|
||||
Cow::Borrowed("_:a")
|
||||
} else if subject.is_blank() {
|
||||
String::from("_:z")
|
||||
Cow::Borrowed("_:z")
|
||||
} else {
|
||||
format!("{}", subject.with(vocabulary))
|
||||
Cow::Owned(subject.with(vocabulary).to_string())
|
||||
}
|
||||
}
|
||||
|
||||
fn serialize_object<N>(
|
||||
identifier: &N::BlankId,
|
||||
object: &QuadValue<N>,
|
||||
vocabulary: &N,
|
||||
) -> Cow<'static, str>
|
||||
where
|
||||
N: Vocabulary,
|
||||
N::BlankId: Eq,
|
||||
{
|
||||
match object {
|
||||
Value::Literal(lit) => Cow::Owned(lit.with(vocabulary).to_string()),
|
||||
Value::Reference(subject) => serialize_subject(identifier, subject, vocabulary),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -403,6 +713,26 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
fn subject_as_blank_id<N>(subject: &QuadSubject<N>) -> Option<&N::BlankId>
|
||||
where
|
||||
N: Vocabulary,
|
||||
{
|
||||
match subject {
|
||||
Subject::Blank(ref blank) => Some(blank),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn object_as_blank_id<N>(object: &QuadValue<N>) -> Option<&N::BlankId>
|
||||
where
|
||||
N: Vocabulary,
|
||||
{
|
||||
match object {
|
||||
Value::Reference(Subject::Blank(ref blank)) => Some(blank),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn issue_identifier_algorithm<N>(
|
||||
identifier: N::BlankId,
|
||||
generator: &mut Blank,
|
||||
|
@ -440,3 +770,11 @@ fn canonicalization_node_generator() -> Blank {
|
|||
fn make_issuer(prefix: &str) -> Blank {
|
||||
Blank::new_with_prefix(String::from(prefix))
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Security {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Aborted due to time complexity")
|
||||
}
|
||||
}
|
||||
|
||||
// Marks `Security` as a standard error type; the trait's default methods are
// used, so no source error is reported.
impl std::error::Error for Security {}
|
||||
|
|
Loading…
Reference in a new issue