Implement hash_first_degree_quads

This commit is contained in:
asonix 2022-12-10 18:49:29 -06:00
parent 0cfa8376e9
commit b4bd3d6781

View file

@ -1,3 +1,4 @@
use contextual::WithContext;
use indexmap::IndexMap;
use iref::IriBuf;
use json_ld::{ExpandedDocument, ValidId as Subject};
@ -27,10 +28,6 @@ pub trait Sha256 {
fn finalize_hex_and_reset(&mut self) -> HexHash;
}
/// A value that can expose a byte-slice view of itself when handed a `Context`.
pub trait Hashable<Context> {
/// Returns a byte slice borrowed from `self`; `context` is taken by value.
fn as_bytes_with(&self, context: Context) -> &[u8];
}
type Expanded<N> = Meta<
ExpandedDocument<
<N as IriVocabulary>::Iri,
@ -49,10 +46,12 @@ where
pub quads: Vec<NormalizingQuad<N>>,
}
pub struct CanonicalizationState<N>
pub struct CanonicalizationState<N, S>
where
N: Vocabulary,
{
sha256: S,
// Identifier Prefix and Identifier Counter
blank_node_generator: Blank,
vocabulary: N,
@ -64,13 +63,17 @@ where
hash_to_blank_nodes: BTreeMap<HexHash, HashSet<N::BlankId>>,
}
impl<N> CanonicalizationState<N>
impl<N, S> CanonicalizationState<N, S>
where
N: Vocabulary,
{
/// Step 1
pub fn new(vocabulary: N) -> Self {
pub fn new(vocabulary: N) -> Self
where
S: Default,
{
Self {
sha256: S::default(),
blank_node_generator: canonicalization_node_generator(),
vocabulary,
issued_identifier_list: Default::default(),
@ -81,6 +84,7 @@ where
pub fn normalize(&mut self, document_id: IriBuf, expanded: Expanded<N>) -> OutputDataset<N>
where
S: Sha256,
N: VocabularyMut + Default,
N::Iri: Clone + Eq + Hash + Send + Sync,
N::BlankId: Clone + Eq + Hash + Send + Sync + for<'a> Borrow<&'a N::BlankId>,
@ -91,7 +95,7 @@ where
self.find_blank_nodes(&input_dataset);
// Step 3, 4, and 5
self.issue_simple_canonical_identifiers();
self.issue_simple_canonical_identifiers(&input_dataset);
// Step 6
self.issue_complex_canonical_identifiers();
@ -131,8 +135,9 @@ where
}
// Step 3, 4, and 5
fn issue_simple_canonical_identifiers(&mut self)
fn issue_simple_canonical_identifiers(&mut self, input_dataset: &InputDataset<N>)
where
S: Sha256,
N::BlankId: Clone + Eq + Hash,
N: VocabularyMut + BlankIdVocabularyMut,
{
@ -154,7 +159,13 @@ where
// step 5.3
for identifier in non_normalized_identifiers.iter() {
// step 5.3.1
let hash = self.hash_first_degree_quads((*identifier).clone());
let hash = hash_first_degree_quads(
&self.blank_node_to_quads,
&self.vocabulary,
(*identifier).clone(),
input_dataset,
&mut self.sha256,
);
// step 5.3.2
self.hash_to_blank_nodes
@ -296,10 +307,6 @@ where
}
}
/// Unimplemented stub: the body is `todo!()`, so any call panics instead of
/// returning a `HexHash`.
fn hash_first_degree_quads(&self, identifier: N::BlankId) -> HexHash {
todo!()
}
fn hash_n_degree_quads(
&self,
identifier: N::BlankId,
@ -311,6 +318,91 @@ where
}
}
/// Computes the first-degree hash for the blank node `identifier`.
///
/// Every quad position recorded for `identifier` in `blank_node_to_quads` is
/// looked up in `input_dataset` and serialized with `serialize_quad`. The
/// serialized quads are sorted, concatenated without a separator, and fed to
/// `sha256`; the value of `finalize_hex_and_reset` is returned.
///
/// An identifier with no recorded quads hashes the empty string.
///
/// # Panics
///
/// Panics if a recorded position cannot be found in `input_dataset`.
fn hash_first_degree_quads<N, S>(
    blank_node_to_quads: &HashMap<N::BlankId, HashSet<Position>>,
    vocabulary: &N,
    identifier: N::BlankId,
    input_dataset: &InputDataset<N>,
    sha256: &mut S,
) -> HexHash
where
    N: Vocabulary,
    N::BlankId: Eq + Hash + Clone,
    S: Sha256,
{
    // Steps 1 to 3.1.1.1: serialize each quad that mentions the reference
    // blank node; a missing map entry simply yields no quads.
    let mut serialized_quads: Vec<String> = blank_node_to_quads
        .get(&identifier)
        .into_iter()
        .flatten()
        .map(|position| {
            let quad = input_dataset
                .get(*position)
                .expect("Positions are created from the input dataset");

            serialize_quad(&identifier, quad, vocabulary)
        })
        .collect();

    // Step 4: the set iteration order is arbitrary, so sort before hashing.
    serialized_quads.sort();

    // Step 5: hash the concatenation of the sorted quads.
    let payload = serialized_quads.concat();
    sha256.update(payload.as_bytes());
    sha256.finalize_hex_and_reset()
}
/// Serializes `quad` as a single N-Quads statement for first-degree hashing.
///
/// The subject, predicate, and (when present) graph name are rendered with
/// `serialize_subject`, which substitutes `_:a` / `_:z` for blank nodes
/// relative to `identifier`. The object is rendered through `vocabulary`.
///
/// The returned string ends with the N-Quads statement terminator `" .\n"`.
/// The caller concatenates statements without a separator before hashing, so
/// the terminator is what keeps consecutive statements apart and makes the
/// hash input a valid N-Quads document.
fn serialize_quad<N>(identifier: &N::BlankId, quad: &NormalizingQuad<N>, vocabulary: &N) -> String
where
    N: Vocabulary,
    N::BlankId: Clone + Eq,
{
    let subject = serialize_subject(identifier, quad.subject(), vocabulary);
    let predicate = serialize_subject(identifier, quad.predicate(), vocabulary);
    // NOTE(review): the object is rendered as-is, so a blank-node object is
    // not rewritten to `_:a` / `_:z` here. Step 3.1.1 of the algorithm applies
    // to every component of the quad; confirm whether the object needs the
    // same substitution.
    let object = quad.object().with(vocabulary);

    match quad.graph() {
        Some(graph) => {
            let graph = serialize_subject(identifier, graph, vocabulary);
            format!("{subject} {predicate} {object} {graph} .\n")
        }
        // Quads in the default graph carry no graph name component.
        None => format!("{subject} {predicate} {object} .\n"),
    }
}
/// Renders one quad component for first-degree hashing.
///
/// A blank node is replaced by `_:a` when `matches_identifier` reports that it
/// is `identifier`, and by `_:z` otherwise. Anything that is not a blank node
/// is formatted through `vocabulary`.
fn serialize_subject<N>(identifier: &N::BlankId, subject: &QuadSubject<N>, vocabulary: &N) -> String
where
    N: Vocabulary,
    N::BlankId: Eq,
{
    // Non-blank components keep their own textual form.
    if !subject.is_blank() {
        return subject.with(vocabulary).to_string();
    }

    let label = if matches_identifier::<N>(identifier, subject) {
        "_:a"
    } else {
        "_:z"
    };

    label.to_owned()
}
/// Returns `true` only when `subject` is a blank node equal to `identifier`;
/// an IRI never matches.
fn matches_identifier<N>(identifier: &N::BlankId, subject: &QuadSubject<N>) -> bool
where
    N: Vocabulary,
    N::BlankId: Eq,
{
    matches!(subject, Subject::Blank(blank) if blank == identifier)
}
fn issue_identifier_algorithm<N>(
identifier: N::BlankId,
generator: &mut Blank,
@ -348,48 +440,3 @@ fn canonicalization_node_generator() -> Blank {
/// Builds a `Blank` generator from an owned copy of `prefix`.
fn make_issuer(prefix: &str) -> Blank {
    let owned_prefix = prefix.to_owned();
    Blank::new_with_prefix(owned_prefix)
}
/// Forwards `Hashable` through a shared reference to the referenced value.
impl<'a, T, Context> Hashable<Context> for &'a T
where
    T: Hashable<Context>,
{
    fn as_bytes_with(&self, context: Context) -> &[u8] {
        // `self` is `&&T`; reborrow down to the underlying `&T` and delegate.
        <T as Hashable<Context>>::as_bytes_with(&**self, context)
    }
}
/// Forwards `Hashable` through an exclusive reference to the referenced value.
impl<'a, T, Context> Hashable<Context> for &'a mut T
where
    T: Hashable<Context>,
{
    fn as_bytes_with(&self, context: Context) -> &[u8] {
        // `self` is `&&mut T`; a shared reborrow of the target is enough here.
        <T as Hashable<Context>>::as_bytes_with(&**self, context)
    }
}
/// Forwards `Hashable` through a `Box` to the boxed value.
impl<T, Context> Hashable<Context> for Box<T>
where
    T: Hashable<Context>,
{
    fn as_bytes_with(&self, context: Context) -> &[u8] {
        // Dereference the box and delegate to the boxed value.
        <T as Hashable<Context>>::as_bytes_with(&**self, context)
    }
}
/// Forwards `Hashable` through an `Rc` to the shared value.
impl<T, Context> Hashable<Context> for std::rc::Rc<T>
where
    T: Hashable<Context>,
{
    fn as_bytes_with(&self, context: Context) -> &[u8] {
        // Dereference the `Rc` and delegate to the shared value.
        <T as Hashable<Context>>::as_bytes_with(&**self, context)
    }
}
/// Forwards `Hashable` through an `Arc` to the shared value.
impl<T, Context> Hashable<Context> for std::sync::Arc<T>
where
    T: Hashable<Context>,
{
    fn as_bytes_with(&self, context: Context) -> &[u8] {
        // Dereference the `Arc` and delegate to the shared value.
        <T as Hashable<Context>>::as_bytes_with(&**self, context)
    }
}