/* * Copyright (C) 2020-2021 Matthieu Gautier * Copyright (C) 2021 Maneesh P M * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ARCHIVE_H #define ZIM_ARCHIVE_H #include "zim.h" #include "entry.h" #include "uuid.h" #include #include #include #include #include namespace zim { class FileImpl; enum class EntryOrder { pathOrder, titleOrder, efficientOrder }; /** * The Archive class to access content in a zim file. * * The `Archive` is the main class to access content in a zim file. * `Archive` are lightweight object and can be copied easily. * * An `Archive` is read-only, and internal states (as caches) are protected * from race-condition. Therefore, all methods of `Archive` are threadsafe. * * All methods of archive may throw an `ZimFileFormatError` if the file is invalid. */ class LIBZIM_API Archive { public: template class EntryRange; template class iterator; /** Archive constructor. * * Construct an archive from a filename. * The file is open readonly. * * The filename is the "logical" path. * So if you want to open a split zim file (foo.zimaa, foo.zimab, ...) * you must pass the `foo.zim` path. * * @param fname The filename to the file to open (utf8 encoded) */ explicit Archive(const std::string& fname); #ifndef _WIN32 /** Archive constructor. * * Construct an archive from a file descriptor. * * Note: This function is not available under Windows. * * @param fd The descriptor of a seekable file representing a ZIM archive */ explicit Archive(int fd); /** Archive constructor. * * Construct an archive from a descriptor of a file with an embedded ZIM * archive inside. * * Note: This function is not available under Windows. * * @param fd The descriptor of a seekable file with a continuous segment * representing a complete ZIM archive. * @param offset The offset of the ZIM archive relative to the beginning * of the file (rather than the current position associated with fd). * @param size The size of the ZIM archive. */ Archive(int fd, offset_type offset, size_type size); #endif /** Return the filename of the zim file. * * Return the filename as passed to the constructor * (So foo.zim). * * @return The logical filename of the archive. */ const std::string& getFilename() const; /** Return the logical archive size. * * Return the size of the full archive, not the size of the file on the fs. * If the zim is split, return the sum of the size of the parts. * * @return The logical size of the archive. */ size_type getFilesize() const; /** Return the number of entries in the archive. * * Return the total number of entries in the archive, including * internal entries created by libzim itself, metadata, indexes, ... * * @return the number of all entries in the archive. */ entry_index_type getAllEntryCount() const; /** Return the number of user entries in the archive. * * If the notion of "user entries" doesn't exist in the zim archive, * returns `getAllEntryCount()`. * * @return the number of user entries in the archive. */ entry_index_type getEntryCount() const; /** Return the number of articles in the archive. * * The definition of "article" depends of the zim archive. * On recent archives, this correspond to all entries marked as "FRONT_ARTICLE" * at creaton time. * On old archives, this corresponds to all "text/html*" entries. * * @return the number of articles in the archive. */ entry_index_type getArticleCount() const; /** Return the number of media in the archive. * * This definition of "media" is based on the mimetype. * * @return the number of media in the archive. */ entry_index_type getMediaCount() const; /** The uuid of the archive. * * @return the uuid of the archive. */ Uuid getUuid() const; /** Get a specific metadata content. * * Get the content of a metadata stored in the archive. * * @param name The name of the metadata. * @return The content of the metadata. * @exception EntryNotFound If the metadata is not in the arcthive. */ std::string getMetadata(const std::string& name) const; /** Get a specific metadata item. * * Get the item associated to a metadata stored in the archive. * * @param name The name of the metadata. * @return The item associated to the metadata. * @exception EntryNotFound If the metadata in not in the archive. */ Item getMetadataItem(const std::string& name) const; /** Get the list of metadata stored in the archive. * * @return The list of metadata in the archive. */ std::vector getMetadataKeys() const; /** Get the illustration item of the archive. * * Illustration is a icon for the archive that can be used in catalog and so to illustrate the archive. * * @param size The size (width and height) of the illustration to get. Default to 48 (48x48px icon) * @return The illustration item. * @exception EntryNotFound If no illustration item can be found. */ Item getIllustrationItem(unsigned int size=48) const; /** Return a list of available sizes (width) for the illustations in the archive. * * Illustration is an icon for the archive that can be used in catalog and elsewehere to illustrate the archive. * An Archive may contains several illustrations with different size. * This method allows to know which illustration are in the archive (by size: width) * * @return A set of size. */ std::set getIllustrationSizes() const; /** Get an entry using its "path" index. * * Use the index of the entry to get the idx'th entry * (entry being sorted by path). * * @param idx The index of the entry. * @return The Entry. * @exception std::out_of_range If idx is greater than the number of entry. */ Entry getEntryByPath(entry_index_type idx) const; /** Get an entry using a path. * * Get an entry using its path. * The path must contains the namespace. * * @param path The entry's path. * @return The Entry. * @exception EntryNotFound If no entry has the asked path. */ Entry getEntryByPath(const std::string& path) const; /** Get an entry using its "title" index. * * Use the index of the entry to get the idx'th entry * (entry being sorted by title). * * @param idx The index of the entry. * @return The Entry. * @exception std::out_of_range If idx is greater than the number of entry. */ Entry getEntryByTitle(entry_index_type idx) const; /** Get an entry using a title. * * Get an entry using its path. * * @param title The entry's title. * @return The Entry. * @exception EntryNotFound If no entry has the asked title. */ Entry getEntryByTitle(const std::string& title) const; /** Get an entry using its "cluster" index. * * Use the index of the entry to get the idx'th entry * The actual order of the entry is not really specified. * It is infered from the internal way the entry are stored. * * This method is probably not relevent and is provided for completeness. * You should probably use a iterator using the `efficientOrder`. * * @param idx The index of the entry. * @return The Entry. * @exception std::out_of_range If idx is greater than the number of entry. */ Entry getEntryByClusterOrder(entry_index_type idx) const; /** Get the main entry of the archive. * * @return The Main entry. * @exception EntryNotFound If no main entry has been specified in the archive. */ Entry getMainEntry() const; /** Get a random entry. * * The entry is picked randomly from the front artice list. * * @return A random entry. * @exception EntryNotFound If no valid random entry can be found. */ Entry getRandomEntry() const; /** Check in an entry has path in the archive. * * @param path The entry's path. * @return True if the path in the archive, false else. */ bool hasEntryByPath(const std::string& path) const { try{ getEntryByPath(path); return true; } catch(...) { return false; } } /** Check in an entry has title in the archive. * * @param title The entry's title. * @return True if the title in the archive, false else. */ bool hasEntryByTitle(const std::string& title) const { try{ getEntryByTitle(title); return true; } catch(...) { return false; } } /** Check if archive has a main entry * * @return True if the archive has a main entry. */ bool hasMainEntry() const; /** Check if archive has a favicon entry * * @param size The size (width and height) of the illustration to check. Default to 48 (48x48px icon) * @return True if the archive has a corresponding illustration entry. * (Always True if the archive has no illustration, but a favicon) */ bool hasIllustration(unsigned int size=48) const; /** Check if the archive has a fulltext index. * * @return True if the archive has a fulltext index */ bool hasFulltextIndex() const; /** Check if the archive has a title index. * * @return True if the archive has a title index */ bool hasTitleIndex() const; /** Get a "iterable" by path order. * * This method allow to iterate on all user entries using a path order. * If the notion of "user entries" doesn't exists (for old zim archive), * this iterate on all entries in the zim file. * * ``` * for(auto& entry:archive.iterByPath()) { * ... * } * ``` * * @return A range on all the entries, in path order. */ EntryRange iterByPath() const; /** Get a "iterable" by title order. * * This method allow to iterate on all articles using a title order. * The definition of "article" depends of the zim archive. * On recent archives, this correspond to all entries marked as "FRONT_ARTICLE" * at creaton time. * On old archives, this correspond to all entries in 'A' namespace. * Few archives may have been created without namespace but also without specific * article listing. In this case, this iterate on all user entries. * * ``` * for(auto& entry:archive.iterByTitle()) { * ... * } * ``` * * @return A range on all the entries, in title order. */ EntryRange iterByTitle() const; /** Get a "iterable" by a efficient order. * * This method allow to iterate on all user entries using a effictient order. * If the notion of "user entries" doesn't exists (for old zim archive), * this iterate on all entries in the zim file. * * ``` * for(auto& entry:archive.iterEfficient()) { * ... * } * ``` * * @return A range on all the entries, in efficitent order. */ EntryRange iterEfficient() const; /** Find a range of entries starting with path. * * The path is the "long path". (Ie, with the namespace) * * @param path The path prefix to search for. * @return A range starting from the first entry starting with path * and ending past the last entry. * If no entry starts with `path`, begin == end. */ EntryRange findByPath(std::string path) const; /** Find a range of entry starting with title. * * The entry title is search in `A` namespace. * * @param title The title prefix to search for. * @return A range starting from the first entry starting with title * and ending past the last entry. * If no entry starts with `title`, begin == end. */ EntryRange findByTitle(std::string title) const; /** hasChecksum. * * The checksum is not the checksum of the file. * It is an internal checksum stored in the zim file. * * @return True if the archive has a checksum. */ bool hasChecksum() const; /** getChecksum. * * @return the checksum stored in the archive. * If the archive has no checksum return an empty string. */ std::string getChecksum() const; /** Check that the zim file is valid (in regard to its checksum). * * If the zim file has no checksum return false. * * @return True if the file is valid. */ bool check() const; /** Check the integrity of the zim file. * * Run different type of checks to verify the zim file is valid * (in regard to the zim format). * This may be time consuming. * * @return True if the file is valid. */ bool checkIntegrity(IntegrityCheck checkType); /** Check if the file is split in the filesystem. * * @return True if the archive is split in different file (foo.zimaa, foo.zimbb). */ bool isMultiPart() const; /** Get if the zim archive uses the new namespace scheme. * * Recent zim file use the new namespace scheme. * * On user perspective, it means that : * - On old namespace scheme : * . All entries are accessible, either using `getEntryByPath` with a specific namespace * or simply iterating over the entries (with `iter*` methods). * . Entry's path has namespace included ("A/foo.html") * - On new namespace scheme : * . Only the "user" entries are accessible with `getEntryByPath` and `iter*` methods. * To access metadatas, use `getMetadata` method. * . Entry's path do not contains namespace ("foo.html") */ bool hasNewNamespaceScheme() const; /** Get a shared ptr on the FileImpl * * @internal * @return The shared_ptr */ std::shared_ptr getImpl() const { return m_impl; } #ifdef ZIM_PRIVATE cluster_index_type getClusterCount() const; offset_type getClusterOffset(cluster_index_type idx) const; entry_index_type getMainEntryIndex() const; #endif private: std::shared_ptr m_impl; }; template LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); template<> LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); template<> LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); template<> LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); /** * A range of entries in an `Archive`. * * `EntryRange` represents a range of entries in a specific order. * * An `EntryRange` can't be modified is consequently threadsafe. */ template class LIBZIM_API Archive::EntryRange { public: explicit EntryRange(const std::shared_ptr file, entry_index_type begin, entry_index_type end) : m_file(file), m_begin(begin), m_end(end) {} iterator begin() const { return iterator(m_file, entry_index_type(m_begin)); } iterator end() const { return iterator(m_file, entry_index_type(m_end)); } int size() const { return m_end - m_begin; } EntryRange offset(int start, int maxResults) const { auto begin = m_begin + start; if (begin > m_end) { begin = m_end; } auto end = m_end; if (begin + maxResults < end) { end = begin + maxResults; } return EntryRange(m_file, begin, end); } private: std::shared_ptr m_file; entry_index_type m_begin; entry_index_type m_end; }; /** * An iterator on an `Archive`. * * `Archive::iterator` stores an internal state which is not protected * from race-condition. It is not threadsafe. * * An `EntryRange` can't be modified and is consequently threadsafe. * * Be aware that the referenced/pointed Entry is generated and stored * in the iterator itself. * Once the iterator is destructed or incremented/decremented, you must NOT * use the Entry. */ template class LIBZIM_API Archive::iterator { public: /* SuggestionIterator is conceptually a bidirectional iterator. * But std *LegayBidirectionalIterator* is also a *LegacyForwardIterator* and * it would impose us that : * > Given a and b, dereferenceable iterators of type It: * > If a and b compare equal (a == b is contextually convertible to true) * > then either they are both non-dereferenceable or *a and *b are references bound to the same object. * and * > the LegacyForwardIterator requirements requires dereference to return a reference. * Which cannot be as we create the entry on demand. * * So we are stick with declaring ourselves at `input_iterator`. */ using iterator_category = std::input_iterator_tag; using value_type = Entry; using pointer = Entry*; using reference = Entry&; explicit iterator(const std::shared_ptr file, entry_index_type idx) : m_file(file), m_idx(idx), m_entry(nullptr) {} iterator(const iterator& other) : m_file(other.m_file), m_idx(other.m_idx), m_entry(other.m_entry?new Entry(*other.m_entry):nullptr) {} bool operator== (const iterator& it) const { return m_file == it.m_file && m_idx == it.m_idx; } bool operator!= (const iterator& it) const { return !operator==(it); } iterator& operator=(iterator&& it) = default; iterator& operator=(iterator& it) { m_entry.reset(); m_idx = it.m_idx; m_file = it.m_file; return *this; } iterator& operator++() { ++m_idx; m_entry.reset(); return *this; } iterator operator++(int) { auto it = *this; operator++(); return it; } iterator& operator--() { --m_idx; m_entry.reset(); return *this; } iterator operator--(int) { auto it = *this; operator--(); return it; } const Entry& operator*() const { if (!m_entry) { m_entry.reset(new Entry(m_file, _toPathOrder(*m_file, m_idx))); } return *m_entry; } const Entry* operator->() const { operator*(); return m_entry.get(); } private: std::shared_ptr m_file; entry_index_type m_idx; mutable std::unique_ptr m_entry; }; /** * The set of the integrity checks to be performed by `zim::validate()`. */ typedef std::bitset IntegrityCheckList; /** Check the integrity of the zim file. * * Run the specified checks to verify the zim file is valid * (with regard to the zim format). Some checks can be quite slow. * * @param zimPath The path of the ZIM archive to be checked. * @param checksToRun The set of checks to perform. * @return False if any check fails, true otherwise. */ bool validate(const std::string& zimPath, IntegrityCheckList checksToRun); } #endif // ZIM_ARCHIVE_H