1212#include < myhtml/serialization.h>
1313#include < mycss/selectors/serialization.h>
1414#include < modest/finder/finder.h>
15+ #include < fmt/core.h>
1516
1617using namespace std ;
1718
@@ -27,6 +28,9 @@ Usage: %s [options] <selector> <mode> [mode argument]
2728 delimiter character to use between results (defaults to newline)
2829 -0, --null
2930 uses \0 as delimiter
31+ -F, --format <selector> <format string>
32+ specify custom format string for element stringification (can be specified multiple times)
33+ example: `-F a '->{}<-'` - renders <a> text wrapped in '->' and '<-'
3034
3135 <selector>
3236 CSS selector to match against
@@ -53,11 +57,18 @@ static const string afmt_e = "m";
5357static const vector<char > collapsible = {' ' , ' \t ' , ' \n ' , ' \r ' };
5458static const vector<myhtml_tag_id_t > breaking = {
5559 MyHTML_TAG_BR,
56- MyHTML_TAG_P
60+ MyHTML_TAG_P,
61+ MyHTML_TAG_H1,
62+ MyHTML_TAG_H2,
63+ MyHTML_TAG_H3,
64+ MyHTML_TAG_H4,
65+ MyHTML_TAG_H5,
66+ MyHTML_TAG_H6,
67+ MyHTML_TAG_HR,
5768};
5869
59- static map<const string, bool > flags = {
60- {" dirtyargs" , false }
70+ static map<const string, int > flags = {
71+ {" dirtyargs" , 0 }
6172};
6273
6374static map<const string, string> state = { // global state
@@ -67,15 +78,16 @@ static map<const string, string> state = { // global state
6778 {" selector" , " " }, // matching selector
6879 {" mode" , " " }, // output mode
6980 {" data" , " " }, // read input data
70- {" modearg" , " " } // mode argument (optional)
81+ {" modearg" , " " }, // mode argument (optional)
82+ {" scratch" , " " }, // scratchpad value (internal use)
7183};
7284
7385bool readarg (int &argc, const char ** &argv, string argname, const bool die_on_err = true ){
7486 if (argc > 1 ){
75- state[argname] = argv[1 ];
7687 argv++;
7788 argc--;
78- flags[" dirtyargs" ] = true ;
89+ state[argname] = *argv;
90+ flags[" dirtyargs" ]++;
7991 return true ;
8092 }else {
8193 if (die_on_err){
@@ -110,6 +122,10 @@ template <typename ...T> inline bool node_in(myhtml_tree_node_t* node, T... tags
110122 return false ;
111123}
112124
125+ bool node_sort (myhtml_tree_node_t * lhs, myhtml_tree_node_t * rhs){
126+ return myhtml_node_element_position (lhs).begin < myhtml_node_element_position (rhs).begin ;
127+ }
128+
113129template <typename ...T> inline bool node_before (myhtml_tree_node_t * node, T... tags){
114130 while ((node = node->next ) && node->tag_id <= 0x003 );
115131
@@ -134,9 +150,21 @@ static map<const char, const string> option_longopts = { // maps shortopts to lo
134150 {' h' , " help" },
135151 {' f' , " file" },
136152 {' d' , " delimiter" },
137- {' 0' , " zero" }
153+ {' 0' , " zero" },
154+ {' F' , " format" },
138155};
139156
157+ vector<tuple<string, string, myhtml_collection_t *>> selector_format = {};
158+
159+ const char * format_node (myhtml_tree_node_t * node){
160+ for (auto & [fselect, fstr, fcollect] : selector_format)
161+ if (fcollect)
162+ for (myhtml_tree_node_t * select_node : vector<myhtml_tree_node_t *>(fcollect->list , fcollect->list +fcollect->length ))
163+ if (node == select_node) return fstr.c_str ();
164+
165+ return " {}" ;
166+ }
167+
140168static map<const string, const function<void (int &, const char **&)>> option_handlers = { // maps longopts to functions
141169 {" help" , [](int &argc, const char ** &argv) {
142170 fprintf (stderr, helptext, state[" progname" ].c_str (), state[" progname" ].c_str (), state[" progname" ].c_str ());
@@ -152,7 +180,29 @@ static map<const string, const function<void(int&, const char**&)>> option_handl
152180 }},
153181 {" zero" , [](int &argc, const char ** &argv) {
154182 state[" delim" ] = " \0 " ;
155- }}
183+ }},
184+ {" format" , [](int &argc, const char ** &argv) {
185+ argv++, argc--;
186+ if (!readarg (argc, argv, " scratch" , false )){
187+ cerr << " missing selector in --format" << endl;
188+ exit (EXIT_FAILURE);
189+ }
190+ string fselect = state[" scratch" ];
191+ if (!readarg (argc, argv, " scratch" , false )){
192+ cerr << " missing format string in --format" << endl;
193+ exit (EXIT_FAILURE);
194+ }
195+ string form = state[" scratch" ];
196+
197+ if (fselect.length () == 0 ){
198+ cerr << " invalid --format " << fselect << " " << form << endl;
199+ exit (EXIT_FAILURE);
200+ }
201+
202+ selector_format.push_back (tuple<string, string, myhtml_collection_t *>(fselect, form, nullptr ));
203+
204+ argv--, argc++;
205+ }},
156206};
157207
158208static pair<const function<void (myhtml_tree_node_t *, string&)>, const function<void (myhtml_tree_node_t *, string&)>> format_handlers = { // {format, unformat}
@@ -169,6 +219,12 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
169219 case MyHTML_TAG_I: // italics on
170220 case MyHTML_TAG_U:
171221 case MyHTML_TAG_EM:
222+ case MyHTML_TAG_H1:
223+ case MyHTML_TAG_H2:
224+ case MyHTML_TAG_H3:
225+ case MyHTML_TAG_H4:
226+ case MyHTML_TAG_H5:
227+ case MyHTML_TAG_H6:
172228 if (ansi) rendered += afmt_s + " 4" + afmt_e;
173229 if (md) rendered += " _" ;
174230 break ;
@@ -201,6 +257,12 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
201257 case MyHTML_TAG_I: // italics off
202258 case MyHTML_TAG_U:
203259 case MyHTML_TAG_EM:
260+ case MyHTML_TAG_H1:
261+ case MyHTML_TAG_H2:
262+ case MyHTML_TAG_H3:
263+ case MyHTML_TAG_H4:
264+ case MyHTML_TAG_H5:
265+ case MyHTML_TAG_H6:
204266 if (ansi) rendered += afmt_s + " 24" + afmt_e; // no italics here :(
205267 if (md) rendered += " _" ;
206268 break ;
@@ -225,6 +287,11 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
225287 rendered += " \t " ;
226288 }
227289 break ;
290+ case MyHTML_TAG_TR:
291+ if (rendered.back () != ' \n ' ){
292+ rendered += " \n " ;
293+ }
294+ break ;
228295 }
229296
230297 if (vec_has (breaking, node_iter->tag_id )){ // <br/>
@@ -233,56 +300,57 @@ static pair<const function<void(myhtml_tree_node_t*, string&)>, const function<v
233300 }
234301};
235302
303+ string render_node (myhtml_tree_node_t * node_iter){
304+ string rendered = " " ;
305+
306+ if (node_iter->tag_id == MyHTML_TAG_STYLE) return rendered;
307+
308+ format_handlers.first (node_iter, rendered);
309+
310+ if (node_iter->tag_id == MyHTML_TAG__TEXT){
311+ string text (myhtml_node_text (node_iter, nullptr ));
312+ if (!node_in (node_iter, MyHTML_TAG_PRE)){
313+ // collapse whitespace to single character
314+ string::iterator nend = unique (text.begin (), text.end (), [](char c1, char c2) -> bool {
315+ return vec_has (collapsible, c1) && vec_has (collapsible, c2);
316+ });
317+ text.resize (static_cast <unsigned long >(nend-text.begin ()));
318+
319+ // replace whitespace with space
320+ replace_if (text.begin (), text.end (), [](char c) -> bool {
321+ return vec_has (collapsible, c);
322+ }, ' ' );
323+ }
324+
325+ rendered += text;
326+ }
327+
328+ if (node_iter->child ){
329+ rendered += render_node (node_iter->child );
330+ }
331+
332+ rendered = fmt::format (format_node (node_iter), rendered);
333+
334+ format_handlers.second (node_iter, rendered);
335+
336+ if ((node_iter = node_iter->next )){
337+ rendered += render_node (node_iter);
338+ }
339+
340+ return rendered;
341+ }
342+
236343static map<const string, const function<void (myhtml_tree_node_t *)>> mode_handlers = { // maps modes to functions
237344 {" data" , [](myhtml_tree_node_t * node) {
238345 myhtml_serialization_tree_callback (node, [](const char * data, size_t len, void * ctx) -> unsigned int {
239- printf (" %.*s " , static_cast < int >(len) , data);
346+ printf (" %s " , data);
240347 return 0 ;
241- }, nullptr );
348+ }, node );
242349 printf (" %c" , state[" delim" ][0 ]);
243350 }},
244351
245352 {" text" , [](myhtml_tree_node_t * node) {
246- string rendered = " " ;
247-
248- myhtml_tree_node_t * node_iter = node->child ;
249- while (node_iter){
250- const char * text_c = myhtml_node_text (node_iter, nullptr );
251- string text = " " ;
252- if (text_c != nullptr ) text += text_c;
253-
254- if (node_iter->tag_id == MyHTML_TAG__TEXT){
255- if (!node_in (node_iter, MyHTML_TAG_PRE)){
256- // collapse whitespace to single character
257- string::iterator nend = unique (text.begin (), text.end (), [](char c1, char c2) -> bool {
258- return vec_has (collapsible, c1) && vec_has (collapsible, c2);
259- });
260- text.resize (static_cast <unsigned long >(nend-text.begin ()));
261-
262- // replace whitespace with space
263- replace_if (text.begin (), text.end (), [](char c) -> bool {
264- return vec_has (collapsible, c);
265- }, ' ' );
266- }
267-
268- rendered += text;
269- }else {
270- format_handlers.first (node_iter, rendered);
271- }
272-
273- if (node_iter->child ) node_iter = node_iter->child ;
274- else {
275- while (node_iter != node && node_iter->next == nullptr ){
276- format_handlers.second (node_iter, rendered);
277-
278- node_iter = node_iter->parent ;
279- }
280- if (node_iter == node) break ;
281-
282- format_handlers.second (node_iter, rendered);
283- node_iter = node_iter->next ;
284- }
285- }
353+ string rendered = render_node (node->child );
286354
287355 size_t index = 0 ;
288356 while ((index = rendered.find (" \n " , index)) != string::npos){ // clear whitespace before multiline content
@@ -296,7 +364,8 @@ static map<const string, const function<void(myhtml_tree_node_t*)>> mode_handler
296364 while (vec_has (collapsible, rendered[0 ])) rendered.erase (0 , 1 ); // clear whitespace before single-line content
297365 while (vec_has (collapsible, *(rendered.end ()-1 ))) rendered.erase (rendered.length ()-1 , 1 ); // clear whitespace after single-line content
298366
299- cout << rendered;
367+ fmt::print (format_node (node), rendered);
368+ // printf(fmt, rendered);
300369 printf (" %c" , state[" delim" ][0 ]);
301370 }},
302371
@@ -314,7 +383,7 @@ static map<const string, const function<void(myhtml_tree_node_t*)>> mode_handler
314383
315384 do {
316385 if (state[" modearg" ] == mycore_string_data (&attr->key )){
317- cout << mycore_string_data (&attr->value );
386+ fmt::print ( format_node (node), mycore_string_data (&attr->value ) );
318387 printf (" %c" , state[" delim" ][0 ]);
319388 }
320389 }while (attr != token->attr_last && (attr = attr->next )); // move attr pointer further & loop if attr_last not hit
@@ -343,8 +412,8 @@ void parseopts(int &argc, const char** &argv){
343412 cerr << " invalid short option '-" << argv[1 ][0 ] << " '" << endl;
344413 exit (EXIT_FAILURE);
345414 }
346- if (flags[" dirtyargs" ]){ // option handler touched argv (args?); skip
347- flags[" dirtyargs" ] = false ;
415+ if (flags[" dirtyargs" ] > 0 ){ // option handler touched argv (args?); skip
416+ flags[" dirtyargs" ]-- ;
348417 break ;
349418 }
350419 }
@@ -406,9 +475,25 @@ int main(int argc, const char* argv[]){
406475 myhtml_collection_t * collection = nullptr ;
407476 modest_finder_by_selectors_list (finder, html_tree->node_html , selectors_list, &collection);
408477
478+ for (auto & [fselect, fstr, fcollect] : selector_format){
479+ mycss_selectors_list_t * fselect_parsed = mycss_selectors_parse (
480+ mycss_entry_selectors (css_entry),
481+ MyENCODING_UTF_8,
482+ fselect.c_str (), fselect.length (),
483+ &mystatus
484+ );
485+ if (fselect_parsed == nullptr || (fselect_parsed->flags & MyCSS_SELECTORS_FLAGS_SELECTOR_BAD)){
486+ cerr << " bad format selector '" << fselect << " '" << endl;
487+ exit (EXIT_FAILURE);
488+ }
489+ modest_finder_by_selectors_list (finder, html_tree->node_html , fselect_parsed, &fcollect);
490+ }
491+
409492 if (collection){
493+ vector<myhtml_tree_node_t *> nodes (collection->list , collection->list +collection->length );
494+ sort (nodes.begin (), nodes.end (), node_sort);
410495 try {
411- for (myhtml_tree_node_t * node : vector< myhtml_tree_node_t *>(collection-> list , collection-> list +collection-> length ) ){
496+ for (myhtml_tree_node_t * node : nodes ){
412497 mode_handlers[state[" mode" ]](node);
413498 }
414499 }catch (bad_function_call&){
0 commit comments